From 967995b65ec6e1b7fb5fb37255150b70de45b875 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Thu, 7 Nov 2024 02:08:04 -0500
Subject: [PATCH 01/92] wip: start importing v1

---
 pysr/julia_helpers.py   |  2 ++
 pysr/juliapkg.json      |  3 ++-
 pysr/sr.py              | 40 +++++++++++++++++++++++++++++-----------
 pysr/test/test.py       | 12 +++++-------
 pysr/test/test_jax.py   |  8 ++------
 pysr/test/test_torch.py | 10 +++-------
 6 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py
index 18d4a6cf3..21822fb8a 100644
--- a/pysr/julia_helpers.py
+++ b/pysr/julia_helpers.py
@@ -22,6 +22,8 @@ def _escape_filename(filename):
     """Turn a path into a string with correctly escaped backslashes."""
+    if filename is None:
+        return None
     str_repr = str(filename)
     str_repr = str_repr.replace("\\", "\\\\")
     return str_repr
diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 6b6e8aceb..2cc137078 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,8 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "=0.24.5"
+            "url": "https://github.com/MilesCranmer/SymbolicRegression.jl",
+            "rev": "v1.0.0-beta3"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
diff --git a/pysr/sr.py b/pysr/sr.py
index 0054ce502..923b656b4 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -356,6 +356,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         different variables, pass a list of complexities to the `fit` method
         with keyword `complexity_of_variables`. You cannot use both.
         Default is `1`.
+    complexity_mapping : str
+        Alternatively, you can pass a function (as a string of Julia code)
+        that takes an expression as input and returns its complexity. Make sure
+        this function operates on `AbstractExpression` (unpacking it to the
+        underlying `AbstractExpressionNode` as needed) and returns an integer.
+        Default is `None`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
@@ -563,8 +569,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     progress : bool
         Whether to use a progress bar instead of printing to stdout.
         Default is `True`.
-    equation_file : str
-        Where to save the files (.csv extension).
+    run_id : str
+        A unique identifier for the run. Will be generated using the
+        current date and time if not provided.
+        Default is `None`.
+    output_directory : str
+        The base directory to save output files to. Files
+        will be saved in a subdirectory according to the run ID.
+        Will be set to `outputs/` if not provided.
         Default is `None`.
     temp_equation_file : bool
         Whether to put the hall of fame file in the temp directory.
@@ -734,6 +746,7 @@ def __init__(
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
         complexity_of_variables: Optional[Union[int, float]] = None,
+        complexity_mapping: Optional[str] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -790,7 +803,8 @@ def __init__(
         update_verbosity: Optional[int] = None,
         print_precision: int = 5,
         progress: bool = True,
-        equation_file: Optional[str] = None,
+        run_id: Optional[str] = None,
+        output_directory: Optional[str] = None,
         temp_equation_file: bool = False,
         tempdir: Optional[str] = None,
         delete_tempfiles: bool = True,
@@ -830,6 +844,7 @@ def __init__(
         self.complexity_of_operators = complexity_of_operators
         self.complexity_of_constants = complexity_of_constants
         self.complexity_of_variables = complexity_of_variables
+        self.complexity_mapping = complexity_mapping
         self.parsimony = parsimony
         self.dimensional_constraint_penalty = dimensional_constraint_penalty
         self.dimensionless_constants_only = dimensionless_constants_only
@@ -890,7 +905,8 @@ def __init__(
         self.print_precision = print_precision
         self.progress = progress
         # - Project management
-        self.equation_file = equation_file
+        self.run_id = run_id
+        self.output_directory = output_directory
         self.temp_equation_file = temp_equation_file
         self.tempdir = tempdir
         self.delete_tempfiles = delete_tempfiles
@@ -1029,7 +1045,7 @@ def from_file(
         assert binary_operators is not None or unary_operators is not None
         assert n_features_in is not None

-        # TODO: copy .bkup file if exists.
+        # TODO: copy .bak file if exists.
         model = cls(
             equation_file=str(equation_file),
             binary_operators=binary_operators,
@@ -1458,7 +1474,7 @@ def _validate_and_set_fit_params(
         elif self.complexity_of_variables is not None:
             complexity_of_variables = self.complexity_of_variables
         else:
-            complexity_of_variables = 1
+            complexity_of_variables = None

         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
@@ -1519,7 +1535,7 @@ def _pre_transform_training_data(
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
-        complexity_of_variables: Union[int, float, List[Union[int, float]]],
+        complexity_of_variables: Optional[Union[int, float, List[Union[int, float]]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1542,7 +1558,7 @@ def _pre_transform_training_data(
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
-        complexity_of_variables : int | float | list[int | float]
+        complexity_of_variables : int | float | list[int | float] | None
            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
            Units of each variable in the training dataset, `X`.
@@ -1790,11 +1806,12 @@ def _run(
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
             complexity_of_variables=complexity_of_variables,
+            complexity_mapping=self.complexity_mapping,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
             maxsize=int(self.maxsize),
-            output_file=_escape_filename(self.equation_file_),
+            output_directory=_escape_filename(self.output_directory),
             npopulations=int(self.populations),
             batching=self.batching,
             batch_size=int(min([batch_size, len(X)]) if self.batching else len(X)),
@@ -1908,6 +1925,7 @@ def _run(
             parallelism=parallelism,
             saved_state=self.julia_state_,
             return_state=True,
+            run_id=self.run_id,
             addprocs_function=cluster_manager,
             heap_size_hint_in_bytes=self.heap_size_hint_in_bytes,
             progress=progress and self.verbosity > 0 and len(y.shape) == 1,
@@ -2315,7 +2333,7 @@ def _read_equation_file(self):
         if self.nout_ > 1:
             all_outputs = []
             for i in range(1, self.nout_ + 1):
-                cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup"
+                cur_filename = str(self.equation_file_) + f".out{i}" + ".bak"
                 if not os.path.exists(cur_filename):
                     cur_filename = str(self.equation_file_) + f".out{i}"
                 with open(cur_filename, "r", encoding="utf-8") as f:
@@ -2326,7 +2344,7 @@ def _read_equation_file(self):

         else:
-            filename = str(self.equation_file_) + ".bkup"
+            filename = str(self.equation_file_) + ".bak"
             if not os.path.exists(filename):
                 filename = str(self.equation_file_)
             with open(filename, "r", encoding="utf-8") as f:
diff --git a/pysr/test/test.py b/pysr/test/test.py
index c641e9f66..42ac6fb46 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -449,7 +449,7 @@ def test_load_model(self):
         for from_backup in [False, True]:
             rand_dir = Path(tempfile.mkdtemp())
             equation_filename = str(rand_dir / "equation.csv")
-            with open(equation_filename + (".bkup" if from_backup else ""), "w") as f:
+            with open(equation_filename + (".bak" if from_backup else ""), "w") as f:
                 f.write(csv_file_data)
             model = PySRRegressor.from_file(
                 equation_filename,
@@ -492,7 +492,7 @@ def test_load_model_simple(self):
         np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X))

         # Try again, but using only the pickle file:
-        for file_to_delete in [str(equation_file), str(equation_file) + ".bkup"]:
+        for file_to_delete in [str(equation_file), str(equation_file) + ".bak"]:
             if os.path.exists(file_to_delete):
                 os.remove(file_to_delete)

@@ -535,16 +535,14 @@ def manually_create_model(equations, feature_names=None):
             model.feature_names_in_ = np.array(feature_names, dtype=object)
             for i in range(model.nout_):
                 equations[i]["complexity loss equation".split(" ")].to_csv(
-                    f"equation_file.csv.out{i+1}.bkup"
+                    f"equation_file.csv.out{i+1}.bak"
                 )
         else:
             model.equation_file_ = "equation_file.csv"
             model.nout_ = 1
             model.selection_mask_ = None
             model.feature_names_in_ = np.array(feature_names, dtype=object)
-            equations["complexity loss equation".split(" ")].to_csv(
-                "equation_file.csv.bkup"
-            )
+            equations["complexity loss equation".split(" ")].to_csv("equation_file.csv.bak")

     model.refresh()

@@ -662,7 +660,7 @@ def test_pickle_with_temp_equation_file(self):
         equation_file_base = model.equation_file_
         for i in range(1, nout + 1):
-            assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup")
+            assert not os.path.exists(str(equation_file_base) + f".out{i}.bak")

         with tempfile.NamedTemporaryFile() as pickle_file:
             pkl.dump(model, pickle_file)
diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py
index e0237c829..8b79f5060 100644
--- a/pysr/test/test_jax.py
+++ b/pysr/test/test_jax.py
@@ -46,9 +46,7 @@ def test_pipeline_pandas(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file.csv.bkup"
-        )
+        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")

         model.refresh(checkpoint_file="equation_file.csv")
         jformat = model.jax()
@@ -73,9 +71,7 @@ def test_pipeline(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file.csv.bkup"
-        )
+        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")

         model.refresh(checkpoint_file="equation_file.csv")
         jformat = model.jax()
diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py
index 8b26f5ca6..af4fb6575 100644
--- a/pysr/test/test_torch.py
+++ b/pysr/test/test_torch.py
@@ -48,9 +48,7 @@ def test_pipeline_pandas(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file.csv.bkup"
-        )
+        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")

         model.refresh(checkpoint_file="equation_file.csv")
         tformat = model.pytorch()
@@ -81,9 +79,7 @@ def test_pipeline(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file.csv.bkup"
-        )
+        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")

         model.refresh(checkpoint_file="equation_file.csv")

@@ -134,7 +130,7 @@ def test_custom_operator(self):
         )

         equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file_custom_operator.csv.bkup"
+            "equation_file_custom_operator.csv.bak"
         )

         model.set_params(

From 932adf442e74cacf60e23fc194c6feca2a57c537 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 9 Nov 2024 15:49:09 -0500
Subject: [PATCH 02/92] docs: update homepage

---
 CONTRIBUTORS.md          |  4 ++--
 README.md                | 16 ++++++++--------
 docs/backend.md          |  6 +++---
 docs/examples.md         |  2 +-
 docs/options.md          |  2 +-
 examples/pysr_demo.ipynb |  4 ++--
 mkdocs.yml               |  2 +-
 pysr/sr.py               | 10 +++++-----
 8 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 409cb7f1c..b7e245b59 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -6,7 +6,7 @@ In this guide you will get an overview of the contribution workflow from opening

 ## New contributor guide

-To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://astroautomata.com/PySR/) give additional information.
+To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://ai.damtp.cam.ac.uk/pysr/) give additional information.
 Here are some resources to help you get started with open source contributions in general:

 - [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github)
@@ -39,7 +39,7 @@ Scan through our [existing issues](https://github.com/MilesCranmer/PySR/issues)
 2. Create a working branch and start with your changes!
 3. (Optional) If you would like to make changes to PySR itself, skip to step 4. However, if you are interested in making changes to the _symbolic regression code_ itself,
-check out the [guide](https://astroautomata.com/PySR/backend/) on modifying a custom SymbolicRegression.jl library.
+check out the [guide](https://ai.damtp.cam.ac.uk/pysr/backend/) on modifying a custom SymbolicRegression.jl library.
 In this case, you might instead be interested in making suggestions to the [SymbolicRegression.jl](http://github.com/MilesCranmer/SymbolicRegression.jl) library.
 4. You can install your local version of PySR with `python setup.py install`, and run tests with `python -m pysr test main`.
diff --git a/README.md b/README.md
index aa1801933..70d755cfb 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ https://github.com/MilesCranmer/PySR/assets/7593028/c8511a49-b408-488f-8f18-b174

 | **Docs** | **Forums** | **Paper** | **colab demo** |
 |:---:|:---:|:---:|:---:|
-|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://astroautomata.com/PySR/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)|
+|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://ai.damtp.cam.ac.uk/pysr/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)|

 | **pip** | **conda** | **Stats** |
 | :---: | :---: | :---: |
@@ -20,14 +20,14 @@

 If you find PySR useful, please cite the paper [arXiv:2305.01582](https://arxiv.org/abs/2305.01582).
-If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://astroautomata.com/PySR/papers)!
+If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://ai.damtp.cam.ac.uk/pysr/papers)!

 **Contents**:

 - [Why PySR?](#why-pysr)
 - [Installation](#installation)
 - [Quickstart](#quickstart)
-- [→ Documentation](https://astroautomata.com/PySR)
+- [→ Documentation](https://ai.damtp.cam.ac.uk/pysr)
 - [Contributors](#contributors-)
@@ -254,16 +254,16 @@ model = PySRRegressor.from_file("hall_of_fame.2022-08-10_100832.281.pkl")

 There are several other useful features such as denoising (e.g., `denoise=True`),
 feature selection (e.g., `select_k_features=3`).
-For examples of these and other features, see the [examples page](https://astroautomata.com/PySR/examples).
-For a detailed look at more options, see the [options page](https://astroautomata.com/PySR/options).
-You can also see the full API at [this page](https://astroautomata.com/PySR/api).
-There are also tips for tuning PySR on [this page](https://astroautomata.com/PySR/tuning).
+For examples of these and other features, see the [examples page](https://ai.damtp.cam.ac.uk/pysr/examples).
+For a detailed look at more options, see the [options page](https://ai.damtp.cam.ac.uk/pysr/options).
+You can also see the full API at [this page](https://ai.damtp.cam.ac.uk/pysr/api).
+There are also tips for tuning PySR on [this page](https://ai.damtp.cam.ac.uk/pysr/tuning).

 ### Detailed Example

 The following code makes use of as many PySR features as possible.
 Note that is just a demonstration of features and you should not use this example as-is.
-For details on what each parameter does, check out the [API page](https://astroautomata.com/PySR/api/).
+For details on what each parameter does, check out the [API page](https://ai.damtp.cam.ac.uk/pysr/api/).

 ```python
 model = PySRRegressor(
diff --git a/docs/backend.md b/docs/backend.md
index b7575d143..092251fbe 100644
--- a/docs/backend.md
+++ b/docs/backend.md
@@ -33,9 +33,9 @@ The main search code can be found in `src/SymbolicRegression.jl`.

 Here are some tips:

-- The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/).
+- The documentation for the backend is given [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/).
 - Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time.
-- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node).
+- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node).
 - For reference, the main loop itself is found in the `equation_search` function inside [`src/SymbolicRegression.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl).
 - Parts of the code which are typically edited by users include:
   - [`src/CheckConstraints.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/CheckConstraints.jl), particularly the function `check_constraints`. This function checks whether a given expression satisfies constraints, such as having a complexity lower than `maxsize`, and whether it contains any forbidden nestings of functions.
@@ -70,6 +70,6 @@ For more information on `juliapkg.json`, see [`pyjuliapkg`](https://github.com/J

 ## Additional notes

-If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://astroautomata.com/SymbolicRegression.jl/dev/).
+If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://ai.damtp.cam.ac.uk/symbolicregression/dev/).

 If you make a change that you think could be useful to other users, don't hesitate to open a pull request on either the PySR or SymbolicRegression.jl repositories! Contributions are very appreciated.
diff --git a/docs/examples.md b/docs/examples.md
index 754875e7c..b666c4102 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -345,7 +345,7 @@ a real number from the loss function). But, you don't need to worry about this,
 make sure to return a scalar number of type `L`.

 The `tree` argument is the current expression being evaluated. You can read
-about the `tree` fields [here](https://astroautomata.com/SymbolicRegression.jl/stable/types/).
+about the `tree` fields [here](https://ai.damtp.cam.ac.uk/symbolicregression/stable/types/).

 For example, let's fix a symbolic form of an expression,
 as a rational function. i.e., $P(X)/Q(X)$ for polynomials $P$ and $Q$.
diff --git a/docs/options.md b/docs/options.md
index 0ccacbcab..7a8f5c11e 100644
--- a/docs/options.md
+++ b/docs/options.md
@@ -276,7 +276,7 @@ model = PySRRegressor(..., weights=weights, elementwise_loss="myloss(x, y, w) =
 model.fit(..., weights=weights)
 ```

-Built-in loss (faster) (see [losses](https://astroautomata.com/SymbolicRegression.jl/dev/losses/)).
+Built-in loss (faster) (see [losses](https://ai.damtp.cam.ac.uk/symbolicregression/dev/losses/)).
 This one computes the L3 norm:

 ```python
diff --git a/examples/pysr_demo.ipynb b/examples/pysr_demo.ipynb
index 8822effec..fabf07971 100644
--- a/examples/pysr_demo.ipynb
+++ b/examples/pysr_demo.ipynb
@@ -321,7 +321,7 @@
     "id": "qvgVbOoSFtQY"
    },
    "source": [
-    "A full list of operators is given here: https://astroautomata.com/PySR/operators,\n",
+    "A full list of operators is given here: https://ai.damtp.cam.ac.uk/pysr/operators,\n",
     "but we can also use any binary or unary operator in `julia`, or define our own as arbitrary functions.\n",
     "\n",
     "Say that we want a command to do quartic powers:\n",
@@ -1498,7 +1498,7 @@
     "id": "S5dO61g1bDhk"
    },
    "source": [
-    "The full list of PySR parameters can be found here: https://astroautomata.com/PySR/api"
+    "The full list of PySR parameters can be found here: https://ai.damtp.cam.ac.uk/pysr/api"
    ]
   }
 ],
diff --git a/mkdocs.yml b/mkdocs.yml
index cff11241c..c996a0062 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -36,7 +36,7 @@ nav:
   - interactive-docs.md

 extra:
-  homepage: https://astroautomata.com/PySR
+  homepage: https://ai.damtp.cam.ac.uk/pysr

 extra_css:
   - stylesheets/extra.css
diff --git a/pysr/sr.py b/pysr/sr.py
index 923b656b4..d3296950a 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -76,7 +76,7 @@ def _process_constraints(binary_operators, unary_operators, constraints):
                 "One typical constraint is to use `constraints={..., '^': (-1, 1)}`, which "
                 "will allow arbitrary-complexity base (-1) but only powers such as "
                 "a constant or variable (1). "
-                "For more tips, please see https://astroautomata.com/PySR/tuning/"
+                "For more tips, please see https://ai.damtp.cam.ac.uk/pysr/tuning/"
             )
             constraints[op] = (-1, -1)
         if op in ["plus", "sub", "+", "-"]:
@@ -221,7 +221,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     Most default parameters have been tuned over several example equations,
     but you should adjust `niterations`, `binary_operators`, `unary_operators`
     to your requirements. You can view more detailed explanations of the options
-    on the [options page](https://astroautomata.com/PySR/options) of the
+    on the [options page](https://ai.damtp.cam.ac.uk/pysr/options) of the
     documentation.

     Parameters
    ----------
@@ -241,7 +241,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         most accurate model.
     binary_operators : list[str]
         List of strings for binary operators used in the search.
-        See the [operators page](https://astroautomata.com/PySR/operators/)
+        See the [operators page](https://ai.damtp.cam.ac.uk/pysr/operators/)
         for more details.
         Default is `["+", "-", "*", "/"]`.
     unary_operators : list[str]
@@ -943,7 +943,7 @@ def __init__(
             elif k == "julia_project":
                 warnings.warn(
                     "The `julia_project` parameter has been deprecated. To use a custom "
-                    "julia project, please see `https://astroautomata.com/PySR/backend`.",
+                    "julia project, please see `https://ai.damtp.cam.ac.uk/pysr/backend`.",
                     FutureWarning,
                 )
             elif k == "julia_kwargs":
@@ -2046,7 +2046,7 @@ def fit(
         if X.shape[0] > 10000 and not self.batching:
             warnings.warn(
                 "Note: you are running with more than 10,000 datapoints. "
-                "You should consider turning on batching (https://astroautomata.com/PySR/options/#batching). "
+                "You should consider turning on batching (https://ai.damtp.cam.ac.uk/pysr/options/#batching). "
                 "You should also reconsider if you need that many datapoints. "
                 "Unless you have a large amount of noise (in which case you "
                 "should smooth your dataset first), generally < 10,000 datapoints "

From 063e437087a81ffb26da222950c80cb38f116764 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 9 Nov 2024 15:50:19 -0500
Subject: [PATCH 03/92] deps: bump to 1.0.0-beta4

---
 pysr/juliapkg.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 2cc137078..3ca980dac 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -4,7 +4,7 @@
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
             "url": "https://github.com/MilesCranmer/SymbolicRegression.jl",
-            "rev": "v1.0.0-beta3"
+            "rev": "v1.0.0-beta4"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

From cfe90f182cbf00f114f21147042489a0922f573e Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 9 Nov 2024 16:07:12 -0500
Subject: [PATCH 04/92] fix: avoid deleting temp dir

---
 pysr/sr.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pysr/sr.py b/pysr/sr.py
index d3296950a..3bc4366cc 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -4,7 +4,6 @@
 import os
 import pickle as pkl
 import re
-import shutil
 import sys
 import tempfile
 import warnings
@@ -1938,9 +1937,6 @@ def _run(
         # Set attributes
         self.equations_ = self.get_hof()

-        if self.delete_tempfiles:
-            shutil.rmtree(self.tempdir_)
-
         ALREADY_RAN = True

         return self

From 9733d2a0c56e18717b8be03bbf2cd98028dbd4cf Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 9 Nov 2024 16:31:30 -0500
Subject: [PATCH 05/92] fix: update to new output format

---
 pysr/sr.py | 91 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 37 deletions(-)

diff --git a/pysr/sr.py b/pysr/sr.py
index 3bc4366cc..2b36b969d 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -8,7 +8,6 @@
 import tempfile
 import warnings
 from dataclasses import dataclass, fields
-from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
@@ -653,10 +652,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Number of output dimensions.
     selection_mask_ : ndarray of shape (`n_features_in_`,)
         Mask of which features of `X` to use when `select_k_features` is set.
-    tempdir_ : Path
+    tempdir_ : Optional[Path]
         Path to the temporary equations directory.
-    equation_file_ : Union[str, Path]
-        Output equation file name produced by the julia backend.
     julia_state_stream_ : ndarray
         The serialized state for the julia SymbolicRegression.jl backend (after fitting),
         stored as an array of uint8, produced by Julia's Serialization.serialize function.
@@ -717,8 +714,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
     selection_mask_: Union[NDArray[np.bool_], None]
-    tempdir_: Path
-    equation_file_: PathLike
+    run_id_: str
+    output_directory_: str
     julia_state_stream_: Union[NDArray[np.uint8], None]
     julia_options_stream_: Union[NDArray[np.uint8], None]
     equation_file_contents_: Union[List[pd.DataFrame], None]
@@ -1178,10 +1175,15 @@ def _checkpoint(self):
         """
         # Save model state:
         self.show_pickle_warnings_ = False
-        with open(_csv_filename_to_pkl_filename(self.equation_file_), "wb") as f:
+        with open(self.get_pkl_filename(), "wb") as f:
             pkl.dump(self, f)
         self.show_pickle_warnings_ = True

+    def get_pkl_filename(self) -> Path:
+        path = Path(self.output_directory_) / self.run_id_ / "checkpoint.pkl"
+        path.parent.mkdir(parents=True, exist_ok=True)
+        return path
+
     @property
     def equations(self):  # pragma: no cover
         warnings.warn(
@@ -1260,27 +1260,33 @@ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
         )

     def _setup_equation_file(self):
-        """
-        Set the full pathname of the equation file.
-
-        This is performed using `tempdir` and
-        `equation_file`.
-        """
-        # Cast tempdir string as a Path object
-        self.tempdir_ = Path(tempfile.mkdtemp(dir=self.tempdir))
-        if self.temp_equation_file:
-            self.equation_file_ = self.tempdir_ / "hall_of_fame.csv"
-        elif self.equation_file is None:
-            if self.warm_start and (
-                hasattr(self, "equation_file_") and self.equation_file_
-            ):
-                pass
-            else:
-                date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
-                self.equation_file_ = "hall_of_fame_" + date_time + ".csv"
+        """Set the pathname of the output directory."""
+        if self.warm_start and (
+            hasattr(self, "run_id_") or hasattr(self, "output_directory_")
+        ):
+            assert hasattr(self, "output_directory_")
+            assert hasattr(self, "run_id_")
+            if self.run_id is not None:
+                assert self.run_id_ == self.run_id
+            if self.output_directory is not None:
+                assert self.output_directory_ == self.output_directory
         else:
-            self.equation_file_ = self.equation_file
-        self.equation_file_contents_ = None
+            self.output_directory_ = (
+                tempfile.mkdtemp()
+                if self.temp_equation_file
+                else (
+                    "outputs"
+                    if self.output_directory is None
+                    else self.output_directory
+                )
+            )
+            self.run_id_ = (
+                cast(str, SymbolicRegression.SearchUtilsModule.generate_run_id())
+                if self.run_id is None
+                else self.run_id
+            )
+        if self.temp_equation_file:
+            assert self.output_directory is None

     def _validate_and_modify_params(self) -> _DynamicallySetParams:
         """
         Ensure parameters passed at initialization are valid.
@@ -1810,7 +1818,7 @@ def _run(
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
             maxsize=int(self.maxsize),
-            output_directory=_escape_filename(self.output_directory),
+            output_directory=_escape_filename(self.output_directory_),
             npopulations=int(self.populations),
             batching=self.batching,
             batch_size=int(min([batch_size, len(X)]) if self.batching else len(X)),
@@ -1924,7 +1932,7 @@ def _run(
             parallelism=parallelism,
             saved_state=self.julia_state_,
             return_state=True,
-            run_id=self.run_id,
+            run_id=self.run_id_,
             addprocs_function=cluster_manager,
             heap_size_hint_in_bytes=self.heap_size_hint_in_bytes,
             progress=progress and self.verbosity > 0 and len(y.shape) == 1,
@@ -2123,7 +2131,7 @@ def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
         if checkpoint_file is not None:
             self.equation_file_ = checkpoint_file
             self.equation_file_contents_ = None
-        check_is_fitted(self, attributes=["equation_file_"])
+        check_is_fitted(self, attributes=["run_id_", "output_directory_"])
         self.equations_ = self.get_hof()

     def predict(self, X, index=None):
@@ -2322,6 +2330,16 @@ def pytorch(self, index=None):
         else:
             return best_equation["torch_format"]

+    def get_equation_file(self, i: Optional[int] = None) -> Path:
+        if i is not None:
+            return (
+                Path(self.output_directory_)
+                / self.run_id_
+                / f"hall_of_fame_output{i}.csv"
+            )
+        else:
+            return Path(self.output_directory_) / self.run_id_ / "hall_of_fame.csv"
+
     def _read_equation_file(self):
         """Read the hall of fame file created by `SymbolicRegression.jl`."""

@@ -2347,18 +2365,16 @@ def _read_equation_file(self):
         if self.nout_ > 1:
             all_outputs = []
             for i in range(1, self.nout_ + 1):
-                cur_filename = str(self.equation_file_) + f".out{i}" + ".bak"
+                cur_filename = str(self.get_equation_file(i)) + ".bak"
                 if not os.path.exists(cur_filename):
-                    cur_filename = str(self.equation_file_) + f".out{i}"
+                    cur_filename = str(self.get_equation_file(i))
                 with open(cur_filename, "r", encoding="utf-8") as f:
                     buf = f.read()
                 buf = _preprocess_julia_floats(buf)
-                df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
-
                 all_outputs.append(df)
         else:
-            filename = str(self.equation_file_) + ".bak"
+            filename = str(self.get_equation_file()) + ".bak"
             if not os.path.exists(filename):
-                filename = str(self.equation_file_)
+                filename = str(self.get_equation_file())
             with open(filename, "r", encoding="utf-8") as f:
                 buf = f.read()
                 buf = _preprocess_julia_floats(buf)
@@ -2392,7 +2408,8 @@ def get_hof(self):
             self,
             attributes=[
                 "nout_",
-                "equation_file_",
+                "run_id_",
+                "output_directory_",
                 "selection_mask_",
                 "feature_names_in_",
             ],

From d37aa63c22da7003418c558c45f2cd288fb9bcdf Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 13:53:17 +0000
Subject: [PATCH 06/92] deps: set backend version to 1.0.0

---
 pysr/juliapkg.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 3ca980dac..0c68a4d50 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,8 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "url": "https://github.com/MilesCranmer/SymbolicRegression.jl",
-            "rev": "v1.0.0-beta4"
+            "version": "1.0.0 - 1.0"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

From c6719d16c0566b698c162b6726a244edbce10adb Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 14:44:39 +0000
Subject: [PATCH 07/92] chore: hide outputs dir

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index c40841695..5dfa00d5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ site
 venv
 requirements-dev.lock
 requirements.lock
+outputs

From 05257dc59126604c065745aa4b9e421a48d95509 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 15:39:22 +0000
Subject: [PATCH 08/92] fix: behavior of loading from checkpoints

---
 pysr/sr.py        | 119 ++++++++++++++++++++++++++--------------------
 pysr/test/test.py |  40 +++++++---------
 pysr/utils.py     |  17 -------
 3 files changed, 83 insertions(+), 93 deletions(-)

diff --git a/pysr/sr.py b/pysr/sr.py
index 2b36b969d..291ad5a39 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -49,7 +49,6 @@
 from .utils import (
     ArrayLike,
     PathLike,
-    _csv_filename_to_pkl_filename,
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
@@ -961,8 +960,9 @@ def __init__(
     @classmethod
     def from_file(
         cls,
-        equation_file: PathLike,
+        equation_file=None,
         *,
+        run_directory: str,
         binary_operators: Optional[List[str]] = None,
         unary_operators: Optional[List[str]] = None,
         n_features_in: Optional[int] = None,
@@ -976,9 +976,10 @@ def from_file(

         Parameters
         ----------
-        equation_file : str or Path
-            Path to a pickle file containing a saved model, or a csv file
-            containing equations.
+        run_directory : str
+            The directory containing outputs from a previous run.
+            This is of the form `[output_directory]/[run_id]`.
+            This argument is required.
         binary_operators : list[str]
             The same binary operators used when creating the model.
             Not needed if loading from a pickle file.
@@ -1008,68 +1009,73 @@ def from_file(
         model : PySRRegressor
             The model with fitted equations.
         """
+        if equation_file is not None:
+            raise ValueError(
+                "Passing `equation_file` is deprecated and no longer compatible with "
+                "the most recent versions of PySR's backend. Please pass `run_directory` "
+                "instead, which contains all checkpoint files."
+            )

-        pkl_filename = _csv_filename_to_pkl_filename(equation_file)
-
-        # Try to load model from .pkl
-        print(f"Checking if {pkl_filename} exists...")
-        if os.path.exists(pkl_filename):
-            print(f"Loading model from {pkl_filename}")
+        pkl_filename = Path(run_directory) / "checkpoint.pkl"
+        if pkl_filename.exists():
+            print(f"Attempting to load model from {pkl_filename}...")
             assert binary_operators is None
             assert unary_operators is None
             assert n_features_in is None
             with open(pkl_filename, "rb") as f:
                 model = pkl.load(f)
-            # Change equation_file_ to be in the same dir as the pickle file
-            base_dir = os.path.dirname(pkl_filename)
-            base_equation_file = os.path.basename(model.equation_file_)
-            model.equation_file_ = os.path.join(base_dir, base_equation_file)

             # Update any parameters if necessary, such as
             # extra_sympy_mappings:
             model.set_params(**pysr_kwargs)
+
             if "equations_" not in model.__dict__ or model.equations_ is None:
                 model.refresh()

             return model
-
-        # Else, we re-create it.
-        print(
-            f"{pkl_filename} does not exist, "
-            "so we must create the model from scratch."
-        )
-        assert binary_operators is not None or unary_operators is not None
-        assert n_features_in is not None
-
-        # TODO: copy .bak file if exists.
-        model = cls(
-            equation_file=str(equation_file),
-            binary_operators=binary_operators,
-            unary_operators=unary_operators,
-            **pysr_kwargs,
-        )
-
-        model.nout_ = nout
-        model.n_features_in_ = n_features_in
-
-        if feature_names_in is None:
-            model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
-            model.display_feature_names_in_ = np.array(
-                [f"x{_subscriptify(i)}" for i in range(n_features_in)]
-            )
         else:
-            assert len(feature_names_in) == n_features_in
-            model.feature_names_in_ = feature_names_in
-            model.display_feature_names_in_ = feature_names_in
+            print(
+                f"Checkpoint file {pkl_filename} does not exist. "
+                "Attempting to recreate model from scratch..."
+            )
+            csv_filename = Path(run_directory) / "hall_of_fame.csv"
+            csv_filename_bak = Path(run_directory) / "hall_of_fame.csv.bak"
+            if not csv_filename.exists() and not csv_filename_bak.exists():
+                raise FileNotFoundError(
+                    f"Hall of fame file `{csv_filename}` or `{csv_filename_bak}` does not exist. "
+                    "Please pass a `run_directory` containing a valid checkpoint file."
+                )
+            assert binary_operators is not None
+            assert unary_operators is not None
+            assert n_features_in is not None
+            model = cls(
+                binary_operators=binary_operators,
+                unary_operators=unary_operators,
+                **pysr_kwargs,
+            )
+            model.nout_ = nout
+            model.n_features_in_ = n_features_in

-        if selection_mask is None:
-            model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
-        else:
-            model.selection_mask_ = selection_mask
+            if feature_names_in is None:
+                model.feature_names_in_ = np.array(
+                    [f"x{i}" for i in range(n_features_in)]
+                )
+                model.display_feature_names_in_ = np.array(
+                    [f"x{_subscriptify(i)}" for i in range(n_features_in)]
+                )
+            else:
+                assert len(feature_names_in) == n_features_in
+                model.feature_names_in_ = feature_names_in
+                model.display_feature_names_in_ = feature_names_in

-        model.refresh(checkpoint_file=equation_file)
+            if selection_mask is None:
+                model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
+            else:
+                model.selection_mask_ = selection_mask

-        return model
+            model.refresh(run_directory=run_directory)
+
+            return model

     def __repr__(self):
         """
@@ -1259,6 +1265,14 @@ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
             equations_.loc[idx_model_selection(equations_, self.model_selection)],
         )

+    @property
+    def equation_file_(self):
+        raise NotImplementedError(
+            "PySRRegressor.equation_file_ is now deprecated. "
+            "Please use PySRRegressor.output_directory_ and PySRRegressor.run_id_ "
+            "instead. For loading, you should pass `run_directory`."
+        )
+
     def _setup_equation_file(self):
         """Set the pathname of the output directory."""
         if self.warm_start and (
@@ -2115,7 +2129,7 @@ def fit(

         return self

-    def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
+    def refresh(self, run_directory: Optional[str] = None) -> None:
         """
         Update self.equations_ with any new options passed.

@@ -2128,8 +2142,9 @@ def refresh(self, run_directory: Optional[str] = None) -> None:
             Path to checkpoint hall of fame file to be loaded.
             The default will use the set `equation_file_`.
""" - if checkpoint_file is not None: - self.equation_file_ = checkpoint_file + if run_directory is not None: + self.output_directory_ = Path(run_directory).parent + self.run_id_ = Path(run_directory).name self.equation_file_contents_ = None check_is_fitted(self, attributes=["run_id_", "output_directory_"]) self.equations_ = self.get_hof() diff --git a/pysr/test/test.py b/pysr/test/test.py index 42ac6fb46..9c6e04e7d 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -22,7 +22,6 @@ _suggest_keywords, idx_model_selection, ) -from pysr.utils import _csv_filename_to_pkl_filename from .params import ( DEFAULT_NCYCLES, @@ -447,12 +446,12 @@ def test_load_model(self): csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")]) for from_backup in [False, True]: - rand_dir = Path(tempfile.mkdtemp()) - equation_filename = str(rand_dir / "equation.csv") + output_directory = Path(tempfile.mkdtemp()) + equation_filename = str(output_directory / "hall_of_fame.csv") with open(equation_filename + (".bak" if from_backup else ""), "w") as f: f.write(csv_file_data) model = PySRRegressor.from_file( - equation_filename, + run_directory=output_directory, n_features_in=5, feature_names_in=["f0", "f1", "f2", "f3", "f4"], binary_operators=["+", "*", "/", "-", "^"], @@ -478,15 +477,16 @@ def test_load_model_simple(self): early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2", ) rand_dir = Path(tempfile.mkdtemp()) - equation_file = rand_dir / "equations.csv" + equation_file = rand_dir / "1" / "hall_of_fame.csv" model.set_params(temp_equation_file=False) - model.set_params(equation_file=equation_file) + model.set_params(output_directory=rand_dir) + model.set_params(run_id="1") model.fit(self.X, y) # lambda functions are removed from the pickling, so we need # to pass it during the loading: model2 = PySRRegressor.from_file( - model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2} ) np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X)) @@ -498,7 +498,7 @@ def test_load_model_simple(self): # pickle_file = rand_dir / "equations.pkl" model3 = PySRRegressor.from_file( - model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2} + run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2} ) np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X)) @@ -628,20 +628,6 @@ def test_feature_selection_handler(self): class TestMiscellaneous(unittest.TestCase): """Test miscellaneous functions.""" - def test_csv_to_pkl_conversion(self): - """Test that csv filename to pkl filename works as expected.""" - tmpdir = Path(tempfile.mkdtemp()) - equation_file = tmpdir / "equations.389479384.28378374.csv" - expected_pkl_file = tmpdir / "equations.389479384.28378374.pkl" - - # First, test inputting the paths: - test_pkl_file = _csv_filename_to_pkl_filename(equation_file) - self.assertEqual(test_pkl_file, str(expected_pkl_file)) - - # Next, test inputting the strings. 
-        test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file))
-        self.assertEqual(test_pkl_file, str(expected_pkl_file))
-
     def test_pickle_with_temp_equation_file(self):
         """If we have a temporary equation file, unpickle the estimator."""
         model = PySRRegressor(
@@ -658,9 +644,15 @@ def test_pickle_with_temp_equation_file(self):

         y_predictions = model.predict(X)

-        equation_file_base = model.equation_file_
+        equation_file_base = Path("outputs") / model.run_id_ / "hall_of_fame"
+        for i in range(1, nout + 1):
+            assert not os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak")
+
+        equation_file_base = (
+            Path(model.output_directory_) / model.run_id_ / "hall_of_fame"
+        )
         for i in range(1, nout + 1):
-            assert not os.path.exists(str(equation_file_base) + f".out{i}.bak")
+            assert os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak")

         with tempfile.NamedTemporaryFile() as pickle_file:
             pkl.dump(model, pickle_file)
diff --git a/pysr/utils.py b/pysr/utils.py
index de7faf16e..9a87fc28e 100644
--- a/pysr/utils.py
+++ b/pysr/utils.py
@@ -1,6 +1,5 @@
 import difflib
 import inspect
-import os
 import re
 from pathlib import Path
 from typing import Any, List, TypeVar, Union
@@ -14,22 +13,6 @@
 PathLike = Union[str, Path]


-def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
-    if os.path.splitext(csv_filename)[1] == ".pkl":
-        return csv_filename
-
-    # Assume that the csv filename is of the form "foo.csv"
-    assert str(csv_filename).endswith(".csv")
-
-    dirname = str(os.path.dirname(csv_filename))
-    basename = str(os.path.basename(csv_filename))
-    base = str(os.path.splitext(basename)[0])
-
-    pkl_basename = base + ".pkl"
-
-    return os.path.join(dirname, pkl_basename)
-
-
 _regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
 _regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
 _regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")

From 80195d8e67c89c886861e37ccef80c641ba1212e Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 16:13:11 +0000
Subject: [PATCH 09/92] test: fix latex tests

---
 pysr/test/test.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/pysr/test/test.py b/pysr/test/test.py
index 9c6e04e7d..55d991882 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -517,32 +517,42 @@ def manually_create_model(equations, feature_names=None):
     if feature_names is None:
         feature_names = ["x0", "x1"]

+    output_directory = tempfile.mkdtemp()
+    run_id = "test"
     model = PySRRegressor(
         progress=False,
         niterations=1,
         extra_sympy_mappings={},
         output_jax_format=False,
         model_selection="accuracy",
-        equation_file="equation_file.csv",
+        output_directory=output_directory,
+        run_id=run_id,
     )
+    model.output_directory_ = output_directory
+    model.run_id_ = run_id
+    os.makedirs(Path(output_directory) / run_id, exist_ok=True)

     # Set up internal parameters as if it had been fitted:
     if isinstance(equations, list):
         # Multi-output.
-        model.equation_file_ = "equation_file.csv"
         model.nout_ = len(equations)
         model.selection_mask_ = None
         model.feature_names_in_ = np.array(feature_names, dtype=object)
         for i in range(model.nout_):
             equations[i]["complexity loss equation".split(" ")].to_csv(
-                f"equation_file.csv.out{i+1}.bak"
+                str(
+                    Path(output_directory)
+                    / run_id
+                    / f"hall_of_fame_output{i+1}.csv.bak"
+                )
             )
     else:
-        model.equation_file_ = "equation_file.csv"
         model.nout_ = 1
         model.selection_mask_ = None
         model.feature_names_in_ = np.array(feature_names, dtype=object)
-        equations["complexity loss equation".split(" ")].to_csv("equation_file.csv.bak")
+        equations["complexity loss equation".split(" ")].to_csv(
+            str(Path(output_directory) / run_id / "hall_of_fame.csv.bak")
+        )

     model.refresh()

From 2cfc4a3385af0a3c35d98461ee80cc3dfecbf467 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:06:48 +0000
Subject: [PATCH 10/92] test: get more tests working

---
 pysr/param_groupings.yml |  4 +++-
 pysr/sr.py               |  3 +--
 pysr/test/test.py        | 17 ++++++++++-------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index 0ff9d63da..2c9fa6702 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -22,6 +22,7 @@
     - complexity_of_operators
     - complexity_of_constants
     - complexity_of_variables
+    - complexity_mapping
     - warmup_maxsize_by
     - use_frequency
     - use_frequency_in_tournament
@@ -92,7 +93,8 @@
     - delete_tempfiles
     - update
 - Exporting the Results:
-    - equation_file
+    - output_directory
+    - run_id
     - output_jax_format
     - output_torch_format
    - extra_sympy_mappings
diff --git a/pysr/sr.py b/pysr/sr.py
index 291ad5a39..478a96564 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -1045,8 +1045,7 @@ def from_file(
                     f"Hall of fame file `{csv_filename}` or `{csv_filename_bak}` does not exist. "
                    "Please pass a `run_directory` containing a valid checkpoint file."
                 )
-            assert binary_operators is not None
-            assert unary_operators is not None
+            assert binary_operators is not None or unary_operators is not None
             assert n_features_in is not None
diff --git a/pysr/test/test.py b/pysr/test/test.py
index 55d991882..3d394a0bc 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -1207,8 +1207,8 @@ def test_unit_propagation(self):
         """
         X = np.ones((100, 3))
         y = np.ones((100, 1))
-        temp_dir = Path(tempfile.mkdtemp())
-        equation_file = str(temp_dir / "equation_file.csv")
+        output_dir = tempfile.mkdtemp()
+        run_id = "test"
         model = PySRRegressor(
             binary_operators=["+", "*"],
             early_stop_condition="(l, c) -> l < 1e-6 && c == 3",
@@ -1223,7 +1223,8 @@ def test_unit_propagation(self):
             deterministic=True,
             procs=0,
             random_state=0,
-            equation_file=equation_file,
+            output_directory=output_dir,
+            run_id=run_id,
             warm_start=True,
         )
         model.fit(
@@ -1243,16 +1244,18 @@ def test_unit_propagation(self):
         )

         # With pkl file:
-        pkl_file = str(temp_dir / "equation_file.pkl")
-        model2 = PySRRegressor.from_file(pkl_file)
+        run_directory = str(Path(output_dir) / run_id)
+        model2 = PySRRegressor.from_file(run_directory=run_directory)
         best2 = model2.get_best()
         self.assertIn("x0", best2["equation"])

         # From csv file alone (we need to delete pkl file:)
         # First, we delete the pkl file:
-        os.remove(pkl_file)
+        os.remove(Path(run_directory) / "checkpoint.pkl")
         model3 = PySRRegressor.from_file(
-            equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1]
+            run_directory=run_directory,
+            binary_operators=["+", "*"],
+            n_features_in=X.shape[1],
         )
         best3 = model3.get_best()
         self.assertIn("x0", best3["equation"])

From 829bb575d1efa306a76c3e2b26030a18c35428e9 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:24:31 +0000
Subject: [PATCH 11/92] deps: bump overall julia requirements

---
 pysr/juliapkg.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 0c68a4d50..fbc308a87 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -1,5 +1,5 @@
 {
-    "julia": "~1.6.7, ~1.7, ~1.8, ~1.9, =1.10.0, ~1.10.3",
+    "julia": "=1.10.0, 1.10.3",
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",

From 04042eff722847be70a2168d1fee8a99db98b371 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:26:00 +0000
Subject: [PATCH 12/92] test: more fixes to test directories

---
 pysr/test/test_startup.py |  7 +++++--
 pysr/test/test_torch.py   | 26 +++++++++++++++++---------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py
index 6ad64b624..65810f013 100644
--- a/pysr/test/test_startup.py
+++ b/pysr/test/test_startup.py
@@ -43,7 +43,8 @@ def test_warm_start_from_file(self):
             )
             model.warm_start = True
             model.temp_equation_file = False
-            model.equation_file = Path(tmpdirname) / "equations.csv"
+            model.output_directory = tmpdirname
+            model.run_id = "test"
             model.deterministic = True
             model.multithreading = False
             model.random_state = 0
@@ -76,7 +77,9 @@ def test_warm_start_from_file(self):
                 y = np.load("{y_file}")

                 print("Loading model from file")
-                model = PySRRegressor.from_file("{model.equation_file}")
+                model = PySRRegressor.from_file(
+                    run_directory="{str(Path(tmpdirname) / model.run_id_)}"
+                )

                 assert model.julia_state_ is not None
diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py
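
Taken together, patches 05-12 replace the old `equation_file` argument with an `output_directory`/`run_id` pair, and change `PySRRegressor.from_file` to take a `run_directory` keyword pointing at `[output_directory]/[run_id]`. A minimal sketch of the resulting workflow, assuming exactly the API introduced in these patches (the toy dataset and operator choices below are illustrative only, not taken from the patches):

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(100, 2)
y = X[:, 0] ** 2 + np.cos(X[:, 1])

model = PySRRegressor(
    binary_operators=["+", "*"],
    unary_operators=["cos"],
    output_directory="outputs",  # base directory; defaults to "outputs/"
    run_id="demo",  # generated from the current date and time if omitted
)
model.fit(X, y)

# Results now live under outputs/demo/, including checkpoint.pkl,
# hall_of_fame.csv, and its hall_of_fame.csv.bak backup:
model2 = PySRRegressor.from_file(run_directory="outputs/demo")
```

If the `checkpoint.pkl` is present, `from_file` unpickles the full model; otherwise it falls back to rebuilding from the hall-of-fame CSV, in which case `binary_operators`/`unary_operators` and `n_features_in` must be supplied, as the tests in patch 10 show.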
index af4fb6575..50f5285c9 100644
--- a/pysr/test/test_torch.py
+++ b/pysr/test/test_torch.py
@@ -1,4 +1,5 @@
 import unittest
+from pathlib import Path

 import numpy as np
 import pandas as pd
@@ -48,9 +49,12 @@ def test_pipeline_pandas(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")
+        for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+            equations["Complexity Loss Equation".split(" ")].to_csv(
+                Path(model.output_directory_) / model.run_id_ / fname
+            )

-        model.refresh(checkpoint_file="equation_file.csv")
+        model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
         tformat = model.pytorch()
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
@@ -79,9 +83,12 @@ def test_pipeline(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")
+        for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+            equations["Complexity Loss Equation".split(" ")].to_csv(
+                Path(model.output_directory_) / model.run_id_ / fname
+            )

-        model.refresh(checkpoint_file="equation_file.csv")
+        model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
         tformat = model.pytorch()
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
@@ -129,16 +136,17 @@ def test_custom_operator(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv(
-            "equation_file_custom_operator.csv.bak"
-        )
+        for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+            equations["Complexity Loss Equation".split(" ")].to_csv(
+                Path(model.output_directory_) / model.run_id_ / fname
+            )

         model.set_params(
-            equation_file="equation_file_custom_operator.csv",
             extra_sympy_mappings={"mycustomoperator": sympy.sin},
             extra_torch_mappings={"mycustomoperator": self.torch.sin},
         )
-        model.refresh(checkpoint_file="equation_file_custom_operator.csv")
+        # TODO: We shouldn't need to specify the run directory here.
+        model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
         self.assertEqual(str(model.sympy()), "sin(x1)")
         # Will automatically use the set global state from get_hof.

From 1817202c138f5acefc72241c5027b2669937b760 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:26:11 +0000
Subject: [PATCH 13/92] test: clean up version check

---
 pysr/test/test_startup.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py
index 65810f013..a92010cee 100644
--- a/pysr/test/test_startup.py
+++ b/pysr/test/test_startup.py
@@ -133,8 +133,6 @@ def test_bad_startup_options(self):
             self.assertIn(warning_test["msg"], result.stderr.decode())

     def test_notebook(self):
-        if jl_version < (1, 9, 0):
-            self.skipTest("Julia version too old")
         if platform.system() == "Windows":
             self.skipTest("Notebook test incompatible with Windows")
         if not os.access(Path(__file__).parent, os.W_OK):

From a271659b9bf7e591bd62f559fa4172187087b40f Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:29:43 +0000
Subject: [PATCH 14/92] test: fix paths in jax test

---
 pysr/test/test_jax.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py
index 8b79f5060..827cb9817 100644
--- a/pysr/test/test_jax.py
+++ b/pysr/test/test_jax.py
@@ -1,5 +1,6 @@
 import unittest
 from functools import partial
+from pathlib import Path

 import numpy as np
 import pandas as pd
@@ -46,9 +47,12 @@ def test_pipeline_pandas(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")
+        for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+            equations["Complexity Loss Equation".split(" ")].to_csv(
+                Path(model.output_directory_) / model.run_id_ / fname
+            )

-        model.refresh(checkpoint_file="equation_file.csv")
+        model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
         jformat = model.jax()

         np.testing.assert_almost_equal(
@@ -73,9 +75,12 @@ def test_pipeline(self):
             }
         )

-        equations["Complexity Loss Equation".split(" ")].to_csv("equation_file.csv.bak")
+        for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+            equations["Complexity Loss Equation".split(" ")].to_csv(
+                Path(model.output_directory_) / model.run_id_ / fname
+            )

-        model.refresh(checkpoint_file="equation_file.csv")
+        model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
         jformat = model.jax()

         np.testing.assert_almost_equal(

From f1f64e61811d65eb93cfeb6a5c58af9c40f10581 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:48:47 +0000
Subject: [PATCH 15/92] deps: update default versions for Docker

---
 .github/workflows/CI.yml           | 4 ++--
 Apptainer.def                      | 4 ++--
 Dockerfile                         | 4 ++--
 pysr/test/test_dev.py              | 4 ++--
 pysr/test/test_dev_pysr.dockerfile | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 1ddf28ea7..a7a4c3719 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -97,8 +97,8 @@ jobs:
         julia-version: ['1']
         include:
           - os: ubuntu-latest
-            python-version: '3.8'
-            julia-version: '1.6'
+            python-version: '3.9'
+            julia-version: '1.10'
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
diff --git a/Apptainer.def b/Apptainer.def
index 962f81687..fa4d87426 100644
--- a/Apptainer.def
+++ b/Apptainer.def
@@ -1,10 +1,10 @@
 # Build an Apptainer SIF file containing a working copy of PySR and its prereqs

 Bootstrap: docker
-From: julia:1.10.4-bullseye
+From: julia:1.11.1-bullseye
 Stage: jl

 Bootstrap: docker
-From: python:3.12-bullseye
+From: python:3.12.6-bullseye
 Stage: runtime

 %environment
diff --git a/Dockerfile b/Dockerfile
index ed0b5f891..e87ce270b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
 # This builds a dockerfile containing a working copy of PySR
 # with all pre-requisites installed.

-ARG JLVERSION=1.10.4
-ARG PYVERSION=3.12.2
+ARG JLVERSION=1.11.1
+ARG PYVERSION=3.12.6
 ARG BASE_IMAGE=bullseye

 FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl
diff --git a/pysr/test/test_dev.py b/pysr/test/test_dev.py
index b8a2b4645..a0f4f7f5d 100644
--- a/pysr/test/test_dev.py
+++ b/pysr/test/test_dev.py
@@ -7,8 +7,8 @@ class TestDev(unittest.TestCase):
     def test_simple_change_to_backend(self):
         """Test that we can use a development version of SymbolicRegression.jl"""
-        PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.6")
-        PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.9")
+        PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.11")
+        PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.12")
         build_result = subprocess.run(
             [
                 "docker",
diff --git a/pysr/test/test_dev_pysr.dockerfile b/pysr/test/test_dev_pysr.dockerfile
index 2978e82b7..421836a8f 100644
--- a/pysr/test/test_dev_pysr.dockerfile
+++ b/pysr/test/test_dev_pysr.dockerfile
@@ -2,8 +2,8 @@
 # tries to manually edit SymbolicRegression.jl and
 # use it from PySR.

-ARG JLVERSION=1.9.4
-ARG PYVERSION=3.11.6
+ARG JLVERSION=1.11.1
+ARG PYVERSION=3.12.6
 ARG BASE_IMAGE=bullseye

 FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl

From f4a083cb33ab20afb66443632885fa7fc3e05fc9 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 22:51:55 +0000
Subject: [PATCH 16/92] deps: fix SR version

---
 pysr/juliapkg.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index fbc308a87..13b301261 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "1.0.0 - 1.0"
+            "version": "=1.0.0"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

From a38119ce0aa34e36bf5458ebcd01d11eb5cd6fef Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 17 Nov 2024 23:16:47 +0000
Subject: [PATCH 17/92] deps: bump to 1.0.0

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 64f4673a7..62101f0c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pysr"
-version = "0.19.4"
+version = "1.0.0"
 authors = [
     {name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
 ]

From 09b1930a1141c4034a09c4e09906b1077fa7d8d5 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Fri, 22 Nov 2024 20:51:11 +0000
Subject: [PATCH 18/92] deps: bump backend to 1.0.1

---
 pysr/juliapkg.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 13b301261..6a7c97348 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "=1.0.0"
+            "version": "=1.0.1"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

From 0d1da79d46d9596292e41c3579b1c3c7f5590685 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 23 Nov 2024 22:58:36 +0000
Subject: [PATCH 19/92] fix: missing clearing of `equation_file_contents_`

---
 pysr/sr.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pysr/sr.py b/pysr/sr.py
index 478a96564..0fe34bdf8 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -1301,6 +1301,9 @@ def _setup_equation_file(self):
         if self.temp_equation_file:
             assert self.output_directory is None

+    def _clear_equation_file_contents(self):
+        self.equation_file_contents_ = None
+
     def _validate_and_modify_params(self) -> _DynamicallySetParams:
         """
         Ensure parameters passed at initialization are valid.
@@ -2037,6 +2040,7 @@ def fit(
             self.y_units_ = None

         self._setup_equation_file()
+        self._clear_equation_file_contents()

         runtime_params = self._validate_and_modify_params()

From bf1552639652c409e226b5c0a848d6b91905eb69 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 23 Nov 2024 23:03:54 +0000
Subject: [PATCH 20/92] chore: ignore mypy cache

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 5dfa00d5e..c864e9903 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ venv
 requirements-dev.lock
 requirements.lock
 outputs
+.mypy_cache

From 59ff59a4da51ad567f7cad7f62609a4ca4461470 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 23 Nov 2024 23:04:12 +0000
Subject: [PATCH 21/92] fix: typing issue for `output_directory_`

---
 pysr/sr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/sr.py b/pysr/sr.py
index 0fe34bdf8..84691f512 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -2146,7 +2146,7 @@ def refresh(self, run_directory: Optional[str] = None) -> None:
             The default will use the set `equation_file_`.
         """
         if run_directory is not None:
-            self.output_directory_ = Path(run_directory).parent
+            self.output_directory_ = str(Path(run_directory).parent)
             self.run_id_ = Path(run_directory).name
             self.equation_file_contents_ = None
         check_is_fitted(self, attributes=["run_id_", "output_directory_"])

From 22e1f9fd946adefc09bc2239796d78a38a0c1498 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sun, 24 Nov 2024 01:20:41 +0000
Subject: [PATCH 22/92] feat: start ExpressionOptions integration

---
 pysr/__init__.py         |   8 ++
 pysr/export.py           | 109 +++++++++++++++++++++++++
 pysr/expression_types.py | 126 +++++++++++++++++++++++++++++
 pysr/sr.py               | 168 +++++++++++----------------------------
 4 files changed, 290 insertions(+), 121 deletions(-)
 create mode 100644 pysr/export.py
 create mode 100644 pysr/expression_types.py

diff --git a/pysr/__init__.py b/pysr/__init__.py
index b40ee840e..86336ef11 100644
--- a/pysr/__init__.py
+++ b/pysr/__init__.py
@@ -7,6 +7,11 @@
 from .deprecated import best, best_callable, best_row, best_tex, install, pysr
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
+from .expression_types import (
+    AbstractExpressionOptions,
+    ExpressionOptions,
+    TemplateExpressionOptions,
+)
 from .julia_extensions import load_all_packages
 from .sr import PySRRegressor

@@ -22,6 +27,9 @@
     "install",
     "load_all_packages",
     "PySRRegressor",
+    "AbstractExpressionOptions",
+    "ExpressionOptions",
+    "TemplateExpressionOptions",
     "best",
     "best_callable",
     "best_row",

diff --git a/pysr/export.py b/pysr/export.py
new file mode 100644
index 000000000..3042bc73e
--- /dev/null
+++ b/pysr/export.py
@@ -0,0 +1,109 @@
+import copy
+from typing import Callable, Dict, Optional, Union
+
+import numpy as np
+import pandas as pd
+from numpy.typing import NDArray
+
+from .export_jax import sympy2jax
+from .export_numpy import sympy2numpy
+from .export_sympy import create_sympy_symbols, pysr2sympy
+from .export_torch import sympy2torch
+from .utils import ArrayLike
+
+
+def add_export_formats(
+    output: pd.DataFrame,
+    *,
+    feature_names_in: ArrayLike[str],
selection_mask: Union[NDArray[np.bool_], None] = None, + extra_sympy_mappings: Optional[Dict[str, Callable]] = None, + extra_torch_mappings: Optional[Dict[Callable, Callable]] = None, + output_torch_format: bool = False, + extra_jax_mappings: Optional[Dict[Callable, str]] = None, + output_jax_format: bool = False, +) -> pd.DataFrame: + + output = copy.deepcopy(output) + + scores = [] + lastMSE = None + lastComplexity = 0 + sympy_format = [] + lambda_format = [] + jax_format = [] + torch_format = [] + + for _, eqn_row in output.iterrows(): + eqn = pysr2sympy( + eqn_row["equation"], + feature_names_in=feature_names_in, + extra_sympy_mappings=extra_sympy_mappings, + ) + sympy_format.append(eqn) + + # NumPy: + sympy_symbols = create_sympy_symbols(feature_names_in) + lambda_format.append( + sympy2numpy( + eqn, + sympy_symbols, + selection=selection_mask, + ) + ) + + # JAX: + if output_jax_format: + func, params = sympy2jax( + eqn, + sympy_symbols, + selection=selection_mask, + extra_jax_mappings=extra_jax_mappings, + ) + jax_format.append({"callable": func, "parameters": params}) + + # Torch: + if output_torch_format: + module = sympy2torch( + eqn, + sympy_symbols, + selection=selection_mask, + extra_torch_mappings=extra_torch_mappings, + ) + torch_format.append(module) + + curMSE = eqn_row["loss"] + curComplexity = eqn_row["complexity"] + + if lastMSE is None: + cur_score = 0.0 + else: + if curMSE > 0.0: + # TODO Move this to more obvious function/file. + cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity) + else: + cur_score = np.inf + + scores.append(cur_score) + lastMSE = curMSE + lastComplexity = curComplexity + + output["score"] = np.array(scores) + output["sympy_format"] = sympy_format + output["lambda_format"] = lambda_format + output_cols = [ + "complexity", + "loss", + "score", + "equation", + "sympy_format", + "lambda_format", + ] + if output_jax_format: + output_cols += ["jax_format"] + output["jax_format"] = jax_format + if output_torch_format: + output_cols += ["torch_format"] + output["torch_format"] = torch_format + + return output[output_cols] diff --git a/pysr/expression_types.py b/pysr/expression_types.py new file mode 100644 index 000000000..bd4615e31 --- /dev/null +++ b/pysr/expression_types.py @@ -0,0 +1,126 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Literal, Optional + +from .julia_import import SymbolicRegression, jl + + +class AbstractExpressionOptions(ABC): + """Abstract base class for expression types. + + This basically just holds the options for the expression type, + as well as explains how to parse and evaluate them. + + All expression types must implement: + + 1. julia_expression_type(): The actual expression type, returned as a Julia object. + This will get stored as `expression_type` in `SymbolicRegression.Options`. + 2. julia_expression_options(): Method to create the expression options, returned as a Julia object. + These will get stored as `expression_options` in `SymbolicRegression.Options`. + 3. load_from(): whether expressions are read from the hall of fame file, or loaded from Julia. 
+ """ + + @abstractmethod + def julia_expression_type(self) -> Any: + """The expression type""" + pass + + @abstractmethod + def julia_expression_options(self) -> Any: + """The expression options""" + pass + + @abstractmethod + def load_from(self) -> Literal["file", "julia"]: + """If expressions are read from the hall of fame file, or loaded from Julia""" + pass + + +class ExpressionOptions(AbstractExpressionOptions): + """Options for the regular Expression expression type""" + + def julia_expression_type(self): + return SymbolicRegression.Expression + + def julia_expression_options(self): + return jl.NamedTuple() + + def load_from(self): + return "file" + + +class TemplateExpressionOptions(AbstractExpressionOptions): + """The structure of a template expression. + + This class allows you to specify how multiple sub-expressions should be combined + in a structured way, with constraints on which variables each sub-expression can use. + Pass this to PySRRegressor with the `expression_options` argument when you are using + the `TemplateExpression` expression type. + + Parameters + ---------- + function_symbols : list[str] + List of symbols representing the inner expressions (e.g., ["f", "g"]). + These will be used as keys in the template structure. + combine : str + Julia function string that defines how the sub-expressions are combined. + Takes a NamedTuple of expressions and a tuple of data vectors. + For example: "((; f, g), (x1, x2, x3)) -> f(x1, x2) + g(x3)^2" + would constrain f to use x1,x2 and g to use x3. + num_features : dict[str, int] + Dictionary mapping function symbols to the number of features each can use. + For example: {"f": 2, "g": 1} means f takes 2 inputs and g takes 1. + If not provided, will be inferred from the combine function. + + Examples + -------- + ```python + # Create template that combines f(x1, x2) and g(x3): + template_options = TemplateExpressionOptions( + function_symbols=["f", "g"], + combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2", + ) + + # Use in PySRRegressor: + model = PySRRegressor( + expression_options=template_options + ) + ``` + """ + + def __init__( + self, + function_symbols: List[str], + combine: str, + num_features: Optional[Dict[str, int]] = None, + ): + self.function_symbols = function_symbols + self.combine = combine + self.num_features = num_features + + def julia_expression_type(self): + return SymbolicRegression.TemplateExpression + + def julia_expression_options(self): + f_combine = jl.seval(self.combine) + creator = jl.seval( + """ + function _pysr_create_template_structure( + @nospecialize(function_symbols::AbstractVector), + @nospecialize(combine::Function), + @nospecialize(num_features::Union{Nothing,AbstractDict}) + ) + tuple_symbol = (map(Symbol, function_symbols)..., ) + num_features = if num_features === nothing + nothing + else + (; num_features...) 
+                end
+                return SymbolicRegression.TemplateStructure{tuple_symbol}(combine, num_features)
+            end
+            """
+        )
+        structure = creator(self.function_symbols, f_combine, self.num_features)
+        return jl.seval("NamedTuple{(:structure,)}")((structure,))
+
+    def load_from(self):
+        return "julia"
diff --git a/pysr/sr.py b/pysr/sr.py
index 84691f512..750b87d2e 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -24,16 +24,15 @@
 from .denoising import denoise, multi_denoise
 from .deprecated import DEPRECATED_KWARGS
-from .export_jax import sympy2jax
+from .export import add_export_formats
 from .export_latex import (
     sympy2latex,
     sympy2latextable,
     sympy2multilatextable,
     with_preamble,
 )
-from .export_numpy import sympy2numpy
-from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
-from .export_torch import sympy2torch
+from .export_sympy import assert_valid_sympy_symbol
+from .expression_types import AbstractExpressionOptions, ExpressionOptions
 from .feature_selection import run_feature_selection
 from .julia_extensions import load_required_packages
 from .julia_helpers import (
@@ -187,6 +186,25 @@ def _check_assertions(
     )
 
 
+def _validate_export_mappings(extra_jax_mappings, extra_torch_mappings):
+    # It is expected extra_jax/torch_mappings will be updated after fit.
+    # Thus, validation is performed here instead of in _validate_init_params
+    if extra_jax_mappings is not None:
+        for value in extra_jax_mappings.values():
+            if not isinstance(value, str):
+                raise ValueError(
+                    "extra_jax_mappings must have values that are strings! "
+                    "e.g., {sympy.sqrt: 'jnp.sqrt'}."
+                )
+    if extra_torch_mappings is not None:
+        for value in extra_torch_mappings.values():
+            if not callable(value):
+                raise ValueError(
+                    "extra_torch_mappings must have values that are callable! "
+                    "e.g., {sympy.sqrt: torch.sqrt}."
+ ) + + # Class validation constants VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"] @@ -726,6 +744,7 @@ def __init__( *, binary_operators: Optional[List[str]] = None, unary_operators: Optional[List[str]] = None, + expression_options: AbstractExpressionOptions = ExpressionOptions(), niterations: int = 40, populations: int = 15, population_size: int = 33, @@ -818,6 +837,7 @@ def __init__( self.model_selection = model_selection self.binary_operators = binary_operators self.unary_operators = unary_operators + self.expression_options = expression_options self.niterations = niterations self.populations = populations self.population_size = population_size @@ -1830,6 +1850,8 @@ def _run( complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, complexity_mapping=self.complexity_mapping, + expression_type=self.expression_options.julia_expression_type(), + expression_options=self.expression_options.julia_expression_options(), nested_constraints=nested_constraints, elementwise_loss=custom_loss, loss_function=custom_full_objective, @@ -1959,7 +1981,7 @@ def _run( self.julia_state_stream_ = jl_serialize(out) # Set attributes - self.equations_ = self.get_hof() + self.equations_ = self.get_hof(out) ALREADY_RAN = True @@ -2148,7 +2170,7 @@ def refresh(self, run_directory: Optional[str] = None) -> None: if run_directory is not None: self.output_directory_ = str(Path(run_directory).parent) self.run_id_ = Path(run_directory).name - self.equation_file_contents_ = None + self._clear_equation_file_contents() check_is_fitted(self, attributes=["run_id_", "output_directory_"]) self.equations_ = self.get_hof() @@ -2400,8 +2422,8 @@ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: return df - def get_hof(self): - """Get the equations from a hall of fame file. + def get_hof(self, search_output: Optional[Any] = None): + """Get the equations from a hall of fame file or search output. If no arguments entered, the ones used previously from a call to PySR will be used. @@ -2416,122 +2438,26 @@ def get_hof(self): "feature_names_in_", ], ) - if ( + should_read_from_file = self.expression_options.load_from() == "file" and ( not hasattr(self, "equation_file_contents_") - ) or self.equation_file_contents_ is None: + or self.equation_file_contents_ is None + ) + if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() - # It is expected extra_jax/torch_mappings will be updated after fit. - # Thus, validation is performed here instead of in _validate_init_params - extra_jax_mappings = self.extra_jax_mappings - extra_torch_mappings = self.extra_torch_mappings - if extra_jax_mappings is not None: - for value in extra_jax_mappings.values(): - if not isinstance(value, str): - raise ValueError( - "extra_jax_mappings must have keys that are strings! " - "e.g., {sympy.sqrt: 'jnp.sqrt'}." - ) - else: - extra_jax_mappings = {} - if extra_torch_mappings is not None: - for value in extra_torch_mappings.values(): - if not callable(value): - raise ValueError( - "extra_torch_mappings must be callable functions! " - "e.g., {sympy.sqrt: torch.sqrt}." 
- ) - else: - extra_torch_mappings = {} - - ret_outputs = [] - - equation_file_contents = copy.deepcopy(self.equation_file_contents_) - - for output in equation_file_contents: - scores = [] - lastMSE = None - lastComplexity = 0 - sympy_format = [] - lambda_format = [] - jax_format = [] - torch_format = [] - - for _, eqn_row in output.iterrows(): - eqn = pysr2sympy( - eqn_row["equation"], - feature_names_in=self.feature_names_in_, - extra_sympy_mappings=self.extra_sympy_mappings, - ) - sympy_format.append(eqn) - - # NumPy: - sympy_symbols = create_sympy_symbols(self.feature_names_in_) - lambda_format.append( - sympy2numpy( - eqn, - sympy_symbols, - selection=self.selection_mask_, - ) - ) - - # JAX: - if self.output_jax_format: - func, params = sympy2jax( - eqn, - sympy_symbols, - selection=self.selection_mask_, - extra_jax_mappings=self.extra_jax_mappings, - ) - jax_format.append({"callable": func, "parameters": params}) - - # Torch: - if self.output_torch_format: - module = sympy2torch( - eqn, - sympy_symbols, - selection=self.selection_mask_, - extra_torch_mappings=self.extra_torch_mappings, - ) - torch_format.append(module) - - curMSE = eqn_row["loss"] - curComplexity = eqn_row["complexity"] - - if lastMSE is None: - cur_score = 0.0 - else: - if curMSE > 0.0: - # TODO Move this to more obvious function/file. - cur_score = -np.log(curMSE / lastMSE) / ( - curComplexity - lastComplexity - ) - else: - cur_score = np.inf - - scores.append(cur_score) - lastMSE = curMSE - lastComplexity = curComplexity - - output["score"] = np.array(scores) - output["sympy_format"] = sympy_format - output["lambda_format"] = lambda_format - output_cols = [ - "complexity", - "loss", - "score", - "equation", - "sympy_format", - "lambda_format", - ] - if self.output_jax_format: - output_cols += ["jax_format"] - output["jax_format"] = jax_format - if self.output_torch_format: - output_cols += ["torch_format"] - output["torch_format"] = torch_format - - ret_outputs.append(output[output_cols]) + ret_outputs = [ + add_export_formats( + output, + feature_names_in=self.feature_names_in_, + selection_mask=self.selection_mask_, + extra_sympy_mappings=self.extra_sympy_mappings, + extra_torch_mappings=self.extra_torch_mappings, + output_jax_format=self.output_jax_format, + extra_jax_mappings=self.extra_jax_mappings, + output_torch_format=self.output_torch_format, + ) + for output in self.equation_file_contents_ + ] if self.nout_ > 1: return ret_outputs From 2608fddd6b83c6f8a090880cfcf1d5b4f62a1986 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 03:08:44 +0000 Subject: [PATCH 23/92] fix: simplify defaults to true --- pysr/sr.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 750b87d2e..512f66887 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -788,7 +788,7 @@ def __init__( migration: bool = True, hof_migration: bool = True, topn: int = 12, - should_simplify: Optional[bool] = None, + should_simplify: bool = True, should_optimize_constants: bool = True, optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS", optimizer_nrestarts: int = 2, @@ -1758,14 +1758,17 @@ def _run( unary_operators=unary_operators, extra_sympy_mappings=self.extra_sympy_mappings, ) - constraints = _process_constraints( - binary_operators=binary_operators, - unary_operators=unary_operators, - constraints=constraints, - ) - - una_constraints = [constraints[op] for op in unary_operators] - bin_constraints = [constraints[op] for op in binary_operators] + if constraints is 
not None: + constraints = _process_constraints( + binary_operators=binary_operators, + unary_operators=unary_operators, + constraints=constraints, + ) + una_constraints = [constraints[op] for op in unary_operators] + bin_constraints = [constraints[op] for op in binary_operators] + else: + una_constraints = None + bin_constraints = None # Parse dict into Julia Dict for nested constraints:: if nested_constraints is not None: From f3d3358162da8c67e34304fd316c8081457a79cf Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 03:10:46 +0000 Subject: [PATCH 24/92] feat: don't fail search if checkpointing fails --- pysr/sr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 512f66887..764681f38 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1201,7 +1201,10 @@ def _checkpoint(self): # Save model state: self.show_pickle_warnings_ = False with open(self.get_pkl_filename(), "wb") as f: - pkl.dump(self, f) + try: + pkl.dump(self, f) + except Exception as e: + print(f"Error checkpointing model: {e}") self.show_pickle_warnings_ = True def get_pkl_filename(self) -> Path: From 0f2726c527cfff0fd9e4f6c21991bf1b5a0e299f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 03:11:52 +0000 Subject: [PATCH 25/92] feat: get TemplateExpression fit working --- pysr/expression_types.py | 82 +++++++++++++++++++++++++++++++++++++++- pysr/sr.py | 14 +------ 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/pysr/expression_types.py b/pysr/expression_types.py index bd4615e31..a8966168b 100644 --- a/pysr/expression_types.py +++ b/pysr/expression_types.py @@ -1,8 +1,17 @@ +import copy from abc import ABC, abstractmethod -from typing import Any, Dict, List, Literal, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional +import numpy as np +import pandas as pd + +from .export import add_export_formats +from .julia_helpers import jl_array from .julia_import import SymbolicRegression, jl +if TYPE_CHECKING: + from .sr import PySRRegressor + class AbstractExpressionOptions(ABC): """Abstract base class for expression types. @@ -17,6 +26,9 @@ class AbstractExpressionOptions(ABC): 2. julia_expression_options(): Method to create the expression options, returned as a Julia object. These will get stored as `expression_options` in `SymbolicRegression.Options`. 3. load_from(): whether expressions are read from the hall of fame file, or loaded from Julia. + + You can also optionally implement create_exports(), which will be used to + create the exports of the equations. 
""" @abstractmethod @@ -34,6 +46,23 @@ def load_from(self) -> Literal["file", "julia"]: """If expressions are read from the hall of fame file, or loaded from Julia""" pass + def create_exports( + self, + model: "PySRRegressor", + equations: pd.DataFrame, + search_output: Any, + ) -> pd.DataFrame: + return add_export_formats( + equations, + feature_names_in=model.feature_names_in_, + selection_mask=model.selection_mask_, + extra_sympy_mappings=model.extra_sympy_mappings, + extra_torch_mappings=model.extra_torch_mappings, + output_jax_format=model.output_jax_format, + extra_jax_mappings=model.extra_jax_mappings, + output_torch_format=model.output_torch_format, + ) + class ExpressionOptions(AbstractExpressionOptions): """Options for the regular Expression expression type""" @@ -48,6 +77,17 @@ def load_from(self): return "file" +class CallableJuliaExpression: + def __init__(self, expression): + self.expression = expression + + def __call__(self, X: np.ndarray): + if not isinstance(X, np.ndarray): + raise ValueError("X must be a numpy array") + raw_output = self.expression(jl_array(X.T)) + return np.array(raw_output).T + + class TemplateExpressionOptions(AbstractExpressionOptions): """The structure of a template expression. @@ -124,3 +164,43 @@ def julia_expression_options(self): def load_from(self): return "julia" + + def create_exports( + self, + model: "PySRRegressor", + equations: pd.DataFrame, + search_output: Any, + ) -> pd.DataFrame: + equations = copy.deepcopy(equations) + + (_, out_hof) = search_output + expressions = [] + callables = [] + scores = [] + + lastMSE = None + lastComplexity = 0 + + for _, row in equations.iterrows(): + curComplexity = row["complexity"] + curMSE = row["loss"] + expression = out_hof.members[curComplexity - 1].tree + expressions.append(expression) + callables.append(CallableJuliaExpression(expression)) + + if lastMSE is None: + cur_score = 0.0 + else: + if curMSE > 0.0: + # TODO Move this to more obvious function/file. 
+ cur_score = -np.log(curMSE / lastMSE) / ( + curComplexity - lastComplexity + ) + else: + cur_score = np.inf + scores.append(cur_score) + + equations["julia_expression"] = expressions + equations["lambda_format"] = callables + equations["score"] = np.array(scores) + return equations diff --git a/pysr/sr.py b/pysr/sr.py index 764681f38..14ed9fed6 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -47,7 +47,6 @@ from .julia_import import SymbolicRegression, jl from .utils import ( ArrayLike, - PathLike, _preprocess_julia_floats, _safe_check_feature_names_in, _subscriptify, @@ -2444,7 +2443,7 @@ def get_hof(self, search_output: Optional[Any] = None): "feature_names_in_", ], ) - should_read_from_file = self.expression_options.load_from() == "file" and ( + should_read_from_file = ( not hasattr(self, "equation_file_contents_") or self.equation_file_contents_ is None ) @@ -2452,16 +2451,7 @@ def get_hof(self, search_output: Optional[Any] = None): self.equation_file_contents_ = self._read_equation_file() ret_outputs = [ - add_export_formats( - output, - feature_names_in=self.feature_names_in_, - selection_mask=self.selection_mask_, - extra_sympy_mappings=self.extra_sympy_mappings, - extra_torch_mappings=self.extra_torch_mappings, - output_jax_format=self.output_jax_format, - extra_jax_mappings=self.extra_jax_mappings, - output_torch_format=self.output_torch_format, - ) + self.expression_options.create_exports(self, output, search_output) for output in self.equation_file_contents_ ] From b10f7877b25ced6d146e810e89c0beb821eb6148 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 03:20:45 +0000 Subject: [PATCH 26/92] feat: add assertions --- pysr/expression_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pysr/expression_types.py b/pysr/expression_types.py index a8966168b..94c2b5b52 100644 --- a/pysr/expression_types.py +++ b/pysr/expression_types.py @@ -171,6 +171,8 @@ def create_exports( equations: pd.DataFrame, search_output: Any, ) -> pd.DataFrame: + assert search_output is not None + equations = copy.deepcopy(equations) (_, out_hof) = search_output From d3503f5d48be196b38999b7394d728854ae17e5a Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 03:46:28 +0000 Subject: [PATCH 27/92] docs: document `expression_options` --- pysr/expression_types.py | 14 +------------- pysr/param_groupings.yml | 1 + pysr/sr.py | 6 ++++++ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pysr/expression_types.py b/pysr/expression_types.py index 94c2b5b52..239e20b55 100644 --- a/pysr/expression_types.py +++ b/pysr/expression_types.py @@ -1,6 +1,6 @@ import copy from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional import numpy as np import pandas as pd @@ -25,7 +25,6 @@ class AbstractExpressionOptions(ABC): This will get stored as `expression_type` in `SymbolicRegression.Options`. 2. julia_expression_options(): Method to create the expression options, returned as a Julia object. These will get stored as `expression_options` in `SymbolicRegression.Options`. - 3. load_from(): whether expressions are read from the hall of fame file, or loaded from Julia. You can also optionally implement create_exports(), which will be used to create the exports of the equations. 
@@ -41,11 +40,6 @@ def julia_expression_options(self) -> Any: """The expression options""" pass - @abstractmethod - def load_from(self) -> Literal["file", "julia"]: - """If expressions are read from the hall of fame file, or loaded from Julia""" - pass - def create_exports( self, model: "PySRRegressor", @@ -73,9 +67,6 @@ def julia_expression_type(self): def julia_expression_options(self): return jl.NamedTuple() - def load_from(self): - return "file" - class CallableJuliaExpression: def __init__(self, expression): @@ -162,9 +153,6 @@ def julia_expression_options(self): structure = creator(self.function_symbols, f_combine, self.num_features) return jl.seval("NamedTuple{(:structure,)}")((structure,)) - def load_from(self): - return "julia" - def create_exports( self, model: "PySRRegressor", diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index 2c9fa6702..3912e562a 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -2,6 +2,7 @@ - Creating the Search Space: - binary_operators - unary_operators + - expression_options - maxsize - maxdepth - Setting the Search Size: diff --git a/pysr/sr.py b/pysr/sr.py index 14ed9fed6..e5e5359b8 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -262,6 +262,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Operators which only take a single scalar as input. For example, `"cos"` or `"exp"`. Default is `None`. + expression_options : AbstractExpressionOptions + The type of expression to search for. By default, + this is just `ExpressionOptions()`. You can also use + `TemplateExpressionOptions(...)` which allows you to specify + a custom template for the expressions. + Default is `ExpressionOptions()`. niterations : int Number of iterations of the algorithm to run. The best equations are printed and migrate between populations at the From 39b58233d503df7702f25a5d996d7ab4fcbfeb25 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 04:49:59 +0000 Subject: [PATCH 28/92] refactor: have default expression_options be None --- pysr/sr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index e5e5359b8..3d7b31d16 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -749,7 +749,7 @@ def __init__( *, binary_operators: Optional[List[str]] = None, unary_operators: Optional[List[str]] = None, - expression_options: AbstractExpressionOptions = ExpressionOptions(), + expression_options: Optional[AbstractExpressionOptions] = None, niterations: int = 40, populations: int = 15, population_size: int = 33, @@ -1836,6 +1836,8 @@ def _run( optimize=self.weight_optimize, ) + expression_options = self.expression_options or ExpressionOptions() + jl_binary_operators: List[Any] = [] jl_unary_operators: List[Any] = [] for input_list, output_list, name in [ @@ -1861,8 +1863,8 @@ def _run( complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, complexity_mapping=self.complexity_mapping, - expression_type=self.expression_options.julia_expression_type(), - expression_options=self.expression_options.julia_expression_options(), + expression_type=expression_options.julia_expression_type(), + expression_options=expression_options.julia_expression_options(), nested_constraints=nested_constraints, elementwise_loss=custom_loss, loss_function=custom_full_objective, @@ -2456,8 +2458,9 @@ def get_hof(self, search_output: Optional[Any] = None): if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() + expression_options = 
self.expression_options or ExpressionOptions() ret_outputs = [ - self.expression_options.create_exports(self, output, search_output) + expression_options.create_exports(self, output, search_output) for output in self.equation_file_contents_ ] From 5b43b0f935b1093e7f6134483b1e99af846027cb Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 04:50:17 +0000 Subject: [PATCH 29/92] deps: bump backend to 1.0.2 --- pysr/juliapkg.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json index 6a7c97348..0ae4955e0 100644 --- a/pysr/juliapkg.json +++ b/pysr/juliapkg.json @@ -3,7 +3,7 @@ "packages": { "SymbolicRegression": { "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb", - "version": "=1.0.1" + "version": "=1.0.2" }, "Serialization": { "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b", From 20e5eef4060606e17f77082ab6e7b6510618b376 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 04:58:20 +0000 Subject: [PATCH 30/92] deps: bump min julia version --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a7a4c3719..07ece9dae 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -30,7 +30,7 @@ jobs: os: [ubuntu-latest] test-id: [main] include: - - julia-version: '1.6' + - julia-version: '1.10' python-version: '3.8' os: ubuntu-latest test-id: include From f9050125a2011314dd1a403e06b5f589455d18dc Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 20:01:00 +0000 Subject: [PATCH 31/92] test: fix mypy error --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 3d7b31d16..5ce2fd478 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2461,7 +2461,7 @@ def get_hof(self, search_output: Optional[Any] = None): expression_options = self.expression_options or ExpressionOptions() ret_outputs = [ expression_options.create_exports(self, output, search_output) - for output in self.equation_file_contents_ + for output in cast(List[pd.DataFrame], self.equation_file_contents_) ] if self.nout_ > 1: From 73f3af6f56c86e8a942d152da692476a6f344581 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 20:22:45 +0000 Subject: [PATCH 32/92] refactor: avoid duplicate code --- pysr/export.py | 47 ++++++++++------------------------- pysr/expression_types.py | 53 +++++++++++++++++----------------------- pysr/sr.py | 53 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 83 insertions(+), 70 deletions(-) diff --git a/pysr/export.py b/pysr/export.py index 3042bc73e..b79bbacd7 100644 --- a/pysr/export.py +++ b/pysr/export.py @@ -23,12 +23,12 @@ def add_export_formats( extra_jax_mappings: Optional[Dict[Callable, str]] = None, output_jax_format: bool = False, ) -> pd.DataFrame: + """Create export formats for an equations dataframe. + Returns a new dataframe containing only the exported formats. + """ output = copy.deepcopy(output) - scores = [] - lastMSE = None - lastComplexity = 0 sympy_format = [] lambda_format = [] jax_format = [] @@ -72,38 +72,17 @@ def add_export_formats( ) torch_format.append(module) - curMSE = eqn_row["loss"] - curComplexity = eqn_row["complexity"] + exports = pd.DataFrame( + { + "sympy_format": sympy_format, + "lambda_format": lambda_format, + }, + index=output.index, + ) - if lastMSE is None: - cur_score = 0.0 - else: - if curMSE > 0.0: - # TODO Move this to more obvious function/file. 
- cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity) - else: - cur_score = np.inf - - scores.append(cur_score) - lastMSE = curMSE - lastComplexity = curComplexity - - output["score"] = np.array(scores) - output["sympy_format"] = sympy_format - output["lambda_format"] = lambda_format - output_cols = [ - "complexity", - "loss", - "score", - "equation", - "sympy_format", - "lambda_format", - ] if output_jax_format: - output_cols += ["jax_format"] - output["jax_format"] = jax_format + exports["jax_format"] = jax_format if output_torch_format: - output_cols += ["torch_format"] - output["torch_format"] = torch_format + exports["torch_format"] = torch_format - return output[output_cols] + return exports diff --git a/pysr/expression_types.py b/pysr/expression_types.py index 239e20b55..577f86aac 100644 --- a/pysr/expression_types.py +++ b/pysr/expression_types.py @@ -25,9 +25,8 @@ class AbstractExpressionOptions(ABC): This will get stored as `expression_type` in `SymbolicRegression.Options`. 2. julia_expression_options(): Method to create the expression options, returned as a Julia object. These will get stored as `expression_options` in `SymbolicRegression.Options`. - - You can also optionally implement create_exports(), which will be used to - create the exports of the equations. + 3. create_exports(), which will be used to create the exports of the equations, such as + the executable format, the SymPy format, etc. """ @abstractmethod @@ -40,22 +39,15 @@ def julia_expression_options(self) -> Any: """The expression options""" pass + @abstractmethod def create_exports( self, model: "PySRRegressor", equations: pd.DataFrame, search_output: Any, ) -> pd.DataFrame: - return add_export_formats( - equations, - feature_names_in=model.feature_names_in_, - selection_mask=model.selection_mask_, - extra_sympy_mappings=model.extra_sympy_mappings, - extra_torch_mappings=model.extra_torch_mappings, - output_jax_format=model.output_jax_format, - extra_jax_mappings=model.extra_jax_mappings, - output_torch_format=model.output_torch_format, - ) + """Create additional columns in the equations dataframe.""" + pass class ExpressionOptions(AbstractExpressionOptions): @@ -67,6 +59,23 @@ def julia_expression_type(self): def julia_expression_options(self): return jl.NamedTuple() + def create_exports( + self, + model: "PySRRegressor", + equations: pd.DataFrame, + search_output: Any, + ): + return add_export_formats( + equations, + feature_names_in=model.feature_names_in_, + selection_mask=model.selection_mask_, + extra_sympy_mappings=model.extra_sympy_mappings, + extra_torch_mappings=model.extra_torch_mappings, + output_jax_format=model.output_jax_format, + extra_jax_mappings=model.extra_jax_mappings, + output_torch_format=model.output_torch_format, + ) + class CallableJuliaExpression: def __init__(self, expression): @@ -166,31 +175,13 @@ def create_exports( (_, out_hof) = search_output expressions = [] callables = [] - scores = [] - - lastMSE = None - lastComplexity = 0 for _, row in equations.iterrows(): curComplexity = row["complexity"] - curMSE = row["loss"] expression = out_hof.members[curComplexity - 1].tree expressions.append(expression) callables.append(CallableJuliaExpression(expression)) - if lastMSE is None: - cur_score = 0.0 - else: - if curMSE > 0.0: - # TODO Move this to more obvious function/file. 
- cur_score = -np.log(curMSE / lastMSE) / ( - curComplexity - lastComplexity - ) - else: - cur_score = np.inf - scores.append(cur_score) - equations["julia_expression"] = expressions equations["lambda_format"] = callables - equations["score"] = np.array(scores) return equations diff --git a/pysr/sr.py b/pysr/sr.py index 5ce2fd478..0ce93cff8 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -24,7 +24,6 @@ from .denoising import denoise, multi_denoise from .deprecated import DEPRECATED_KWARGS -from .export import add_export_formats from .export_latex import ( sympy2latex, sympy2latextable, @@ -2459,10 +2458,20 @@ def get_hof(self, search_output: Optional[Any] = None): self.equation_file_contents_ = self._read_equation_file() expression_options = self.expression_options or ExpressionOptions() - ret_outputs = [ - expression_options.create_exports(self, output, search_output) - for output in cast(List[pd.DataFrame], self.equation_file_contents_) - ] + + ret_outputs = [] + for output in cast(List[pd.DataFrame], self.equation_file_contents_): + # Calculate scores on base dataframe + final_df = pd.concat( + [ + output, + calculate_scores(output), + expression_options.create_exports(self, output, search_output), + ], + axis=1, + ) + + ret_outputs.append(final_df) if self.nout_ > 1: return ret_outputs @@ -2541,6 +2550,40 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str): return chosen_idx +def calculate_scores(df: pd.DataFrame) -> pd.DataFrame: + """Calculate scores for each equation based on loss and complexity. + + Score is defined as the negated derivative of the log-loss with respect to complexity. + A higher score means the equation achieved a much better loss at a slightly higher complexity. + """ + scores = [] + lastMSE = None + lastComplexity = 0 + + for _, row in df.iterrows(): + curMSE = row["loss"] + curComplexity = row["complexity"] + + if lastMSE is None: + cur_score = 0.0 + else: + if curMSE > 0.0: + cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity) + else: + cur_score = np.inf + + scores.append(cur_score) + lastMSE = curMSE + lastComplexity = curComplexity + + return pd.DataFrame( + { + "score": np.array(scores), + }, + index=df.index, + ) + + def _mutate_parameter(param_name: str, param_value): if param_name in ["binary_operators", "unary_operators"] and isinstance( param_value, str From 17971cc81842c5ad4b66f56e6514d2d77e7630a7 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 20:36:40 +0000 Subject: [PATCH 33/92] refactor: rename expression options to expression specs --- pysr/__init__.py | 14 +++++------ ...xpression_types.py => expression_specs.py} | 10 ++++---- pysr/sr.py | 24 +++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) rename pysr/{expression_types.py => expression_specs.py} (96%) diff --git a/pysr/__init__.py b/pysr/__init__.py index 86336ef11..f955d7259 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -7,10 +7,10 @@ from .deprecated import best, best_callable, best_row, best_tex, install, pysr from .export_jax import sympy2jax from .export_torch import sympy2torch -from .expression_types import ( - AbstractExpressionOptions, - ExpressionOptions, - TemplateExpressionOptions, +from .expression_specs import ( + AbstractExpressionSpec, + ExpressionSpec, + TemplateExpressionSpec, ) from .julia_extensions import load_all_packages from .sr import PySRRegressor @@ -27,9 +27,9 @@ "install", "load_all_packages", "PySRRegressor", - "AbstractExpressionOptions", - "ExpressionOptions", - 
"TemplateExpressionOptions", + "AbstractExpressionSpec", + "ExpressionSpec", + "TemplateExpressionSpec", "best", "best_callable", "best_row", diff --git a/pysr/expression_types.py b/pysr/expression_specs.py similarity index 96% rename from pysr/expression_types.py rename to pysr/expression_specs.py index 577f86aac..aff183d9a 100644 --- a/pysr/expression_types.py +++ b/pysr/expression_specs.py @@ -13,8 +13,8 @@ from .sr import PySRRegressor -class AbstractExpressionOptions(ABC): - """Abstract base class for expression types. +class AbstractExpressionSpec(ABC): + """Abstract base class describing expression types. This basically just holds the options for the expression type, as well as explains how to parse and evaluate them. @@ -50,7 +50,7 @@ def create_exports( pass -class ExpressionOptions(AbstractExpressionOptions): +class ExpressionSpec(AbstractExpressionSpec): """Options for the regular Expression expression type""" def julia_expression_type(self): @@ -88,7 +88,7 @@ def __call__(self, X: np.ndarray): return np.array(raw_output).T -class TemplateExpressionOptions(AbstractExpressionOptions): +class TemplateExpressionSpec(AbstractExpressionSpec): """The structure of a template expression. This class allows you to specify how multiple sub-expressions should be combined @@ -115,7 +115,7 @@ class TemplateExpressionOptions(AbstractExpressionOptions): -------- ```python # Create template that combines f(x1, x2) and g(x3): - template_options = TemplateExpressionOptions( + template_options = TemplateExpressionSpec( function_symbols=["f", "g"], combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2", ) diff --git a/pysr/sr.py b/pysr/sr.py index 0ce93cff8..e98ec0e1c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -31,7 +31,7 @@ with_preamble, ) from .export_sympy import assert_valid_sympy_symbol -from .expression_types import AbstractExpressionOptions, ExpressionOptions +from .expression_specs import AbstractExpressionSpec, ExpressionSpec from .feature_selection import run_feature_selection from .julia_extensions import load_required_packages from .julia_helpers import ( @@ -261,12 +261,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Operators which only take a single scalar as input. For example, `"cos"` or `"exp"`. Default is `None`. - expression_options : AbstractExpressionOptions + expression_spec : AbstractExpressionSpec The type of expression to search for. By default, - this is just `ExpressionOptions()`. You can also use - `TemplateExpressionOptions(...)` which allows you to specify + this is just `ExpressionSpec()`. You can also use + `TemplateExpressionSpec(...)` which allows you to specify a custom template for the expressions. - Default is `ExpressionOptions()`. + Default is `ExpressionSpec()`. niterations : int Number of iterations of the algorithm to run. 
The best equations are printed and migrate between populations at the @@ -748,7 +748,7 @@ def __init__( *, binary_operators: Optional[List[str]] = None, unary_operators: Optional[List[str]] = None, - expression_options: Optional[AbstractExpressionOptions] = None, + expression_spec: Optional[AbstractExpressionSpec] = None, niterations: int = 40, populations: int = 15, population_size: int = 33, @@ -841,7 +841,7 @@ def __init__( self.model_selection = model_selection self.binary_operators = binary_operators self.unary_operators = unary_operators - self.expression_options = expression_options + self.expression_spec = expression_spec self.niterations = niterations self.populations = populations self.population_size = population_size @@ -1835,7 +1835,7 @@ def _run( optimize=self.weight_optimize, ) - expression_options = self.expression_options or ExpressionOptions() + expression_spec = self.expression_spec or ExpressionSpec() jl_binary_operators: List[Any] = [] jl_unary_operators: List[Any] = [] @@ -1862,8 +1862,8 @@ def _run( complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, complexity_mapping=self.complexity_mapping, - expression_type=expression_options.julia_expression_type(), - expression_options=expression_options.julia_expression_options(), + expression_type=expression_spec.julia_expression_type(), + expression_options=expression_spec.julia_expression_options(), nested_constraints=nested_constraints, elementwise_loss=custom_loss, loss_function=custom_full_objective, @@ -2457,7 +2457,7 @@ def get_hof(self, search_output: Optional[Any] = None): if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() - expression_options = self.expression_options or ExpressionOptions() + expression_spec = self.expression_spec or ExpressionSpec() ret_outputs = [] for output in cast(List[pd.DataFrame], self.equation_file_contents_): @@ -2466,7 +2466,7 @@ def get_hof(self, search_output: Optional[Any] = None): [ output, calculate_scores(output), - expression_options.create_exports(self, output, search_output), + expression_spec.create_exports(self, output, search_output), ], axis=1, ) From 688be6be4109b32640fca3cc74ad0847a5f85bb9 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 20:57:33 +0000 Subject: [PATCH 34/92] feat: add interface methods for declaring sympy/latex/jax/torch support --- pysr/expression_specs.py | 40 ++++++++++++++++++++++++++++++++++++- pysr/sr.py | 43 +++++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index aff183d9a..7795dd4f4 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -27,6 +27,10 @@ class AbstractExpressionSpec(ABC): These will get stored as `expression_options` in `SymbolicRegression.Options`. 3. create_exports(), which will be used to create the exports of the equations, such as the executable format, the SymPy format, etc. + + It may also optionally implement: + + - supports_sympy, supports_torch, supports_jax, supports_latex: Whether this expression type supports the corresponding export format. 
""" @abstractmethod @@ -49,6 +53,22 @@ def create_exports( """Create additional columns in the equations dataframe.""" pass + @property + def supports_sympy(self) -> bool: + return False + + @property + def supports_torch(self) -> bool: + return False + + @property + def supports_jax(self) -> bool: + return False + + @property + def supports_latex(self) -> bool: + return False + class ExpressionSpec(AbstractExpressionSpec): """Options for the regular Expression expression type""" @@ -76,6 +96,22 @@ def create_exports( output_torch_format=model.output_torch_format, ) + @property + def supports_sympy(self): + return True + + @property + def supports_torch(self): + return True + + @property + def supports_jax(self): + return True + + @property + def supports_latex(self): + return True + class CallableJuliaExpression: def __init__(self, expression): @@ -168,7 +204,9 @@ def create_exports( equations: pd.DataFrame, search_output: Any, ) -> pd.DataFrame: - assert search_output is not None + # We try to load the raw julia state from a saved binary stream + # if not provided. + search_output = search_output or model.julia_state_ equations = copy.deepcopy(equations) diff --git a/pysr/sr.py b/pysr/sr.py index e98ec0e1c..24b11571c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -680,6 +680,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): stored as an array of uint8, produced by Julia's Serialization.serialize function. julia_options_stream_ : ndarray The serialized julia options, stored as an array of uint8, + expression_spec_ : AbstractExpressionSpec + The expression specification used for this fit. This is equal to + `self.expression_spec` if provided, or `ExpressionSpec()` otherwise. equation_file_contents_ : list[pandas.DataFrame] Contents of the equation file output by the Julia backend. show_pickle_warnings_ : bool @@ -1245,6 +1248,10 @@ def raw_julia_state_(self): ) return self.julia_state_ + @property + def expression_spec_(self): + return self.expression_spec or ExpressionSpec() + def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]: """ Get best equation using `model_selection`. @@ -1835,8 +1842,6 @@ def _run( optimize=self.weight_optimize, ) - expression_spec = self.expression_spec or ExpressionSpec() - jl_binary_operators: List[Any] = [] jl_unary_operators: List[Any] = [] for input_list, output_list, name in [ @@ -1862,8 +1867,8 @@ def _run( complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, complexity_mapping=self.complexity_mapping, - expression_type=expression_spec.julia_expression_type(), - expression_options=expression_spec.julia_expression_options(), + expression_type=self.expression_spec_.julia_expression_type(), + expression_options=self.expression_spec_.julia_expression_options(), nested_constraints=nested_constraints, elementwise_loss=custom_loss, loss_function=custom_full_objective, @@ -2309,6 +2314,10 @@ def latex(self, index=None, precision=3): best_equation : str or list[str] of length nout_ LaTeX expression of the best equation. """ + if not self.expression_spec_.supports_latex: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support latex export." + ) self.refresh() sympy_representation = self.sympy(index=index) if self.nout_ > 1: @@ -2342,6 +2351,10 @@ def jax(self, index=None): Dictionary of callable jax function in "callable" key, and jax array of parameters as "parameters" key. 
""" + if not self.expression_spec_.supports_jax: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support jax export." + ) self.set_params(output_jax_format=True) self.refresh() best_equation = self.get_best(index=index) @@ -2374,6 +2387,10 @@ def pytorch(self, index=None): best_equation : torch.nn.Module PyTorch module representing the expression. """ + if not self.expression_spec_.supports_torch: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support torch export." + ) self.set_params(output_torch_format=True) self.refresh() best_equation = self.get_best(index=index) @@ -2457,21 +2474,19 @@ def get_hof(self, search_output: Optional[Any] = None): if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() - expression_spec = self.expression_spec or ExpressionSpec() + equation_file_contents = cast(List[pd.DataFrame], self.equation_file_contents_) - ret_outputs = [] - for output in cast(List[pd.DataFrame], self.equation_file_contents_): - # Calculate scores on base dataframe - final_df = pd.concat( + ret_outputs = [ + pd.concat( [ output, calculate_scores(output), - expression_spec.create_exports(self, output, search_output), + self.expression_spec_.create_exports(self, output, search_output), ], axis=1, ) - - ret_outputs.append(final_df) + for output in equation_file_contents + ] if self.nout_ > 1: return ret_outputs @@ -2505,6 +2520,10 @@ def latex_table( latex_table_str : str A string that will render a table in LaTeX of the equations. """ + if not self.expression_spec_.supports_latex: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support latex export." + ) self.refresh() if isinstance(self.equations_, list): From 1eebc08fa81fa7eef9f1bcf06655dbf83647942b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 21:01:46 +0000 Subject: [PATCH 35/92] refactor: move out julia expression code --- pysr/expression_specs.py | 50 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 7795dd4f4..430bb2de8 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -113,17 +113,6 @@ def supports_latex(self): return True -class CallableJuliaExpression: - def __init__(self, expression): - self.expression = expression - - def __call__(self, X: np.ndarray): - if not isinstance(X, np.ndarray): - raise ValueError("X must be a numpy array") - raw_output = self.expression(jl_array(X.T)) - return np.array(raw_output).T - - class TemplateExpressionSpec(AbstractExpressionSpec): """The structure of a template expression. @@ -207,19 +196,34 @@ def create_exports( # We try to load the raw julia state from a saved binary stream # if not provided. 
search_output = search_output or model.julia_state_ + return _search_output_to_callable_expressions(equations, search_output) + + +class CallableJuliaExpression: + def __init__(self, expression): + self.expression = expression + + def __call__(self, X: np.ndarray): + if not isinstance(X, np.ndarray): + raise ValueError("X must be a numpy array") + raw_output = self.expression(jl_array(X.T)) + return np.array(raw_output).T - equations = copy.deepcopy(equations) - (_, out_hof) = search_output - expressions = [] - callables = [] +def _search_output_to_callable_expressions( + equations: pd.DataFrame, search_output +) -> pd.DataFrame: + equations = copy.deepcopy(equations) + (_, out_hof) = search_output + expressions = [] + callables = [] - for _, row in equations.iterrows(): - curComplexity = row["complexity"] - expression = out_hof.members[curComplexity - 1].tree - expressions.append(expression) - callables.append(CallableJuliaExpression(expression)) + for _, row in equations.iterrows(): + curComplexity = row["complexity"] + expression = out_hof.members[curComplexity - 1].tree + expressions.append(expression) + callables.append(CallableJuliaExpression(expression)) - equations["julia_expression"] = expressions - equations["lambda_format"] = callables - return equations + equations["julia_expression"] = expressions + equations["lambda_format"] = callables + return equations From b05f8abc1e0ce4e6bea0316a03761089997b0aa0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 23:12:17 +0000 Subject: [PATCH 36/92] feat: ensure `predict` uses same dtype as fit --- pysr/expression_specs.py | 8 +++++--- pysr/sr.py | 16 ++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 430bb2de8..643f0edec 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -224,6 +224,8 @@ def _search_output_to_callable_expressions( expressions.append(expression) callables.append(CallableJuliaExpression(expression)) - equations["julia_expression"] = expressions - equations["lambda_format"] = callables - return equations + df = pd.DataFrame( + {"julia_expression": expressions, "lambda_format": callables}, + index=equations.index, + ) + return df diff --git a/pysr/sr.py b/pysr/sr.py index 24b11571c..3f2528b3e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1586,6 +1586,14 @@ def _validate_data_X(self, X) -> Tuple[ndarray]: raw_out = self._validate_data(X=X, reset=False) # type: ignore return cast(Tuple[ndarray], raw_out) + def _get_precision_mapped_dtype(self, X: np.ndarray) -> np.dtype: + is_complex = np.issubdtype(X.dtype, np.complexfloating) + is_real = not is_complex + if is_real: + return {16: np.float16, 32: np.float32, 64: np.float64}[self.precision] + else: + return {32: np.complex64, 64: np.complex128}[self.precision] + def _pre_transform_training_data( self, X: ndarray, @@ -1924,12 +1932,7 @@ def _run( # Convert data to desired precision test_X = np.array(X) - is_complex = np.issubdtype(test_X.dtype, np.complexfloating) - is_real = not is_complex - if is_real: - np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self.precision] - else: - np_dtype = {32: np.complex64, 64: np.complex128}[self.precision] + np_dtype = self._get_precision_mapped_dtype(test_X) # This converts the data into a Julia array: jl_X = jl_array(np.array(X, dtype=np_dtype).T) @@ -2248,6 +2251,7 @@ def predict(self, X, index=None): # feature selected) X in fit. 
X = X.reindex(columns=self.feature_names_in_) X = self._validate_data_X(X) + X = X.astype(self._get_precision_mapped_dtype(X)) try: if isinstance(best_equation, list): From 02613dab90c87d2539921c4ec29f1773bf1171c7 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 24 Nov 2024 23:55:02 +0000 Subject: [PATCH 37/92] feat: introduce `ParametricExpressionSpec` --- pysr/__init__.py | 2 ++ pysr/expression_specs.py | 24 +++++++++++++++++-- pysr/param_groupings.yml | 2 +- pysr/sr.py | 51 ++++++++++++++++++++++++++++++++++++---- pysr/test/test.py | 1 + 5 files changed, 72 insertions(+), 8 deletions(-) diff --git a/pysr/__init__.py b/pysr/__init__.py index f955d7259..0f4497bfb 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -10,6 +10,7 @@ from .expression_specs import ( AbstractExpressionSpec, ExpressionSpec, + ParametricExpressionSpec, TemplateExpressionSpec, ) from .julia_extensions import load_all_packages @@ -30,6 +31,7 @@ "AbstractExpressionSpec", "ExpressionSpec", "TemplateExpressionSpec", + "ParametricExpressionSpec", "best", "best_callable", "best_row", diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 643f0edec..266779881 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -199,14 +199,34 @@ def create_exports( return _search_output_to_callable_expressions(equations, search_output) +class ParametricExpressionSpec(AbstractExpressionSpec): + def __init__(self, max_parameters: int): + self.max_parameters = max_parameters + + def julia_expression_type(self): + return SymbolicRegression.ParametricExpression + + def julia_expression_options(self): + return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,)) + + def create_exports( + self, + model: "PySRRegressor", + equations: pd.DataFrame, + search_output: Any, + ): + search_output = search_output or model.julia_state_ + return _search_output_to_callable_expressions(equations, search_output) + + class CallableJuliaExpression: def __init__(self, expression): self.expression = expression - def __call__(self, X: np.ndarray): + def __call__(self, X: np.ndarray, *args): if not isinstance(X, np.ndarray): raise ValueError("X must be a numpy array") - raw_output = self.expression(jl_array(X.T)) + raw_output = self.expression(jl_array(X.T), *args) return np.array(raw_output).T diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index 3912e562a..bce623a7a 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -2,7 +2,7 @@ - Creating the Search Space: - binary_operators - unary_operators - - expression_options + - expression_spec - maxsize - maxdepth - Setting the Search Size: diff --git a/pysr/sr.py b/pysr/sr.py index 3f2528b3e..eab007229 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -31,7 +31,11 @@ with_preamble, ) from .export_sympy import assert_valid_sympy_symbol -from .expression_specs import AbstractExpressionSpec, ExpressionSpec +from .expression_specs import ( + AbstractExpressionSpec, + ExpressionSpec, + ParametricExpressionSpec, +) from .feature_selection import run_feature_selection from .julia_extensions import load_required_packages from .julia_helpers import ( @@ -1715,6 +1719,7 @@ def _run( y: ndarray, runtime_params: _DynamicallySetParams, weights: Optional[ndarray], + category: Optional[ndarray], seed: int, ): """ @@ -1733,6 +1738,10 @@ def _run( Weight array of the same shape as `y`. Each element is how to weight the mean-square-error loss for that particular element of y. 
+ category : ndarray | None + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample in `X`. seed : int Random seed for julia backend process. @@ -1948,6 +1957,15 @@ def _run( else: jl_weights = None + if category is not None: + offset_for_julia_indexing = 1 + jl_category = jl_array( + (category + offset_for_julia_indexing).astype(np.int64) + ) + jl_extra = jl.seval("NamedTuple{(:class,)}")((jl_category,)) + else: + jl_extra = jl.NamedTuple() + if self.procs == 0 and not multithreading: parallelism = "serial" elif multithreading: @@ -1973,6 +1991,7 @@ def _run( jl_X, jl_y, weights=jl_weights, + extra=jl_extra, niterations=int(self.niterations), variable_names=jl_array([str(v) for v in self.feature_names_in_]), display_variable_names=jl_array( @@ -2011,6 +2030,7 @@ def fit( self, X, y, + *, Xresampled=None, weights=None, variable_names: Optional[ArrayLike[str]] = None, @@ -2019,6 +2039,7 @@ def fit( ] = None, X_units: Optional[ArrayLike[str]] = None, y_units: Optional[Union[str, ArrayLike[str]]] = None, + category: Optional[ArrayLike[int]] = None, ) -> "PySRRegressor": """ Search for equations to fit the dataset and store them in `self.equations_`. @@ -2055,6 +2076,10 @@ def fit( Similar to `X_units`, but as a unit for the target variable, `y`. If `y` is a matrix, a list of units should be passed. If `X_units` is given but `y_units` is not, then `y_units` will be arbitrary. + category : list[int] + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample. Returns ------- @@ -2086,6 +2111,13 @@ def fit( runtime_params = self._validate_and_modify_params() + if category is not None: + assert Xresampled is None + + if isinstance(self.expression_spec, ParametricExpressionSpec): + assert category is not None + + # TODO: Put `category` here ( X, y, @@ -2165,7 +2197,7 @@ def fit( self._checkpoint() # Perform the search: - self._run(X, y, runtime_params, weights=weights, seed=seed) + self._run(X, y, runtime_params, weights=weights, seed=seed, category=category) # Then, after fit, we save again, so the pickle file contains # the equations: @@ -2194,7 +2226,7 @@ def refresh(self, run_directory: Optional[str] = None) -> None: check_is_fitted(self, attributes=["run_id_", "output_directory_"]) self.equations_ = self.get_hof() - def predict(self, X, index=None): + def predict(self, X, index=None, *, category: Optional[ndarray] = None): """ Predict y from input X using the equation chosen by `model_selection`. @@ -2210,6 +2242,10 @@ def predict(self, X, index=None): particular row of `self.equations_`, you may specify the index here. For multiple output equations, you must pass a list of indices in the same order. + category : ndarray | None + If `expression_spec` is a `ParametricExpressionSpec`, then this + argument should be a list of integers representing the category + of each sample in `X`. 
Returns ------- @@ -2252,15 +2288,20 @@ def predict(self, X, index=None): X = X.reindex(columns=self.feature_names_in_) X = self._validate_data_X(X) X = X.astype(self._get_precision_mapped_dtype(X)) + if category is not None: + offset_for_julia_indexing = 1 + args = (jl_array((category + offset_for_julia_indexing).astype(np.int64)),) + else: + args = () try: if isinstance(best_equation, list): assert self.nout_ > 1 return np.stack( - [eq["lambda_format"](X) for eq in best_equation], axis=1 + [eq["lambda_format"](X, *args) for eq in best_equation], axis=1 ) else: - return best_equation["lambda_format"](X) + return best_equation["lambda_format"](X, *args) except Exception as error: raise ValueError( "Failed to evaluate the expression. " diff --git a/pysr/test/test.py b/pysr/test/test.py index 3d394a0bc..89d465bd4 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -456,6 +456,7 @@ def test_load_model(self): feature_names_in=["f0", "f1", "f2", "f3", "f4"], binary_operators=["+", "*", "/", "-", "^"], unary_operators=["cos"], + precision=64, ) X = self.rstate.rand(100, 5) y_truth = 2.2683423 ** np.cos(X[:, 3]) From 492ae579a760566570d7d3861857da7290e09be2 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 25 Nov 2024 00:29:36 +0000 Subject: [PATCH 38/92] test: fix mypy errors --- pysr/sr.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index eab007229..577e33b6c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -11,7 +11,18 @@ from io import StringIO from multiprocessing import cpu_count from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Tuple, + Type, + Union, + cast, +) import numpy as np import pandas as pd @@ -1590,7 +1601,7 @@ def _validate_data_X(self, X) -> Tuple[ndarray]: raw_out = self._validate_data(X=X, reset=False) # type: ignore return cast(Tuple[ndarray], raw_out) - def _get_precision_mapped_dtype(self, X: np.ndarray) -> np.dtype: + def _get_precision_mapped_dtype(self, X: np.ndarray) -> Type: is_complex = np.issubdtype(X.dtype, np.complexfloating) is_real = not is_complex if is_real: @@ -2039,7 +2050,7 @@ def fit( ] = None, X_units: Optional[ArrayLike[str]] = None, y_units: Optional[Union[str, ArrayLike[str]]] = None, - category: Optional[ArrayLike[int]] = None, + category: Optional[ndarray] = None, ) -> "PySRRegressor": """ Search for equations to fit the dataset and store them in `self.equations_`. 
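Note: to make the keyword-only `category` API being typed above concrete, here is a minimal usage
sketch of `ParametricExpressionSpec` (the data, operator choices, and settings below are
illustrative only, not taken from the patch):

    import numpy as np
    from pysr import ParametricExpressionSpec, PySRRegressor

    # Toy problem: two features, three sample categories (hypothetical data).
    X = np.random.uniform(-3, 3, (100, 2))
    category = np.random.randint(0, 3, 100)
    y = np.cos(X[:, 1] + category) + X[:, 0] ** 2

    model = PySRRegressor(
        expression_spec=ParametricExpressionSpec(max_parameters=2),
        binary_operators=["+", "-", "*", "/"],
        unary_operators=["cos"],
    )
    # `category` maps each row of X to an integer class; it is keyword-only
    # after the `*` added to the `fit` signature in patch 37.
    model.fit(X, y, category=category)
    y_pred = model.predict(X, category=category)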
@@ -2290,7 +2301,9 @@ def predict(self, X, index=None, *, category: Optional[ndarray] = None): X = X.astype(self._get_precision_mapped_dtype(X)) if category is not None: offset_for_julia_indexing = 1 - args = (jl_array((category + offset_for_julia_indexing).astype(np.int64)),) + args: tuple = ( + jl_array((category + offset_for_julia_indexing).astype(np.int64)), + ) else: args = () From 4bada83856fe76cd2f469e28b25670ff182717fd Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 25 Nov 2024 00:42:22 +0000 Subject: [PATCH 39/92] refactor: clean up redundant lines --- pysr/sr.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 577e33b6c..15b9ad808 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1774,13 +1774,7 @@ def _run( # specified in init, so we define them here locally: binary_operators = runtime_params.binary_operators unary_operators = runtime_params.unary_operators - maxdepth = runtime_params.maxdepth constraints = runtime_params.constraints - multithreading = runtime_params.multithreading - batch_size = runtime_params.batch_size - update_verbosity = runtime_params.update_verbosity - progress = runtime_params.progress - warmup_maxsize_by = runtime_params.warmup_maxsize_by nested_constraints = self.nested_constraints complexity_of_operators = self.complexity_of_operators @@ -1788,7 +1782,7 @@ def _run( cluster_manager = self.cluster_manager # Start julia backend processes - if not ALREADY_RAN and update_verbosity != 0: + if not ALREADY_RAN and runtime_params.update_verbosity != 0: print("Compiling Julia backend...") if cluster_manager is not None: @@ -1904,7 +1898,9 @@ def _run( output_directory=_escape_filename(self.output_directory_), npopulations=int(self.populations), batching=self.batching, - batch_size=int(min([batch_size, len(X)]) if self.batching else len(X)), + batch_size=int( + min([runtime_params.batch_size, len(X)]) if self.batching else len(X) + ), mutation_weights=mutation_weights, tournament_selection_p=self.tournament_selection_p, tournament_selection_n=self.tournament_selection_n, @@ -1913,7 +1909,7 @@ def _run( dimensional_constraint_penalty=self.dimensional_constraint_penalty, dimensionless_constants_only=self.dimensionless_constants_only, alpha=self.alpha, - maxdepth=maxdepth, + maxdepth=runtime_params.maxdepth, fast_cycle=self.fast_cycle, turbo=self.turbo, bumper=self.bumper, @@ -1923,7 +1919,7 @@ def _run( fraction_replaced_hof=self.fraction_replaced_hof, should_simplify=self.should_simplify, should_optimize_constants=self.should_optimize_constants, - warmup_maxsize_by=warmup_maxsize_by, + warmup_maxsize_by=runtime_params.warmup_maxsize_by, use_frequency=self.use_frequency, use_frequency_in_tournament=self.use_frequency_in_tournament, adaptive_parsimony_scaling=self.adaptive_parsimony_scaling, @@ -1977,9 +1973,9 @@ def _run( else: jl_extra = jl.NamedTuple() - if self.procs == 0 and not multithreading: + if self.procs == 0 and not runtime_params.multithreading: parallelism = "serial" - elif multithreading: + elif runtime_params.multithreading: parallelism = "multithreading" else: parallelism = "multiprocessing" @@ -2023,7 +2019,9 @@ def _run( run_id=self.run_id_, addprocs_function=cluster_manager, heap_size_hint_in_bytes=self.heap_size_hint_in_bytes, - progress=progress and self.verbosity > 0 and len(y.shape) == 1, + progress=runtime_params.progress + and self.verbosity > 0 + and len(y.shape) == 1, verbosity=int(self.verbosity), ) PythonCall.GC.enable() From 
1196b766f998b3ffda589e263c6d174b0dad56f9 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 25 Nov 2024 00:52:26 +0000 Subject: [PATCH 40/92] refactor: more type stubs --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 15b9ad808..5b2d7bdf0 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -830,7 +830,7 @@ def __init__( fast_cycle: bool = False, turbo: bool = False, bumper: bool = False, - precision: int = 32, + precision: Literal[16, 32, 64] = 32, enable_autodiff: bool = False, random_state=None, deterministic: bool = False, From eea890005c607dd4a2e2a0bbbc0269513d9f792c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 25 Nov 2024 09:01:32 +0000 Subject: [PATCH 41/92] refactor: add more typing information --- pysr/julia_helpers.py | 10 +++-- pysr/julia_import.py | 2 + pysr/sr.py | 94 +++++++++++++++++++++++++++---------------- 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index 21822fb8a..a93b6265d 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,13 +1,13 @@ """Functions for initializing the Julia environment and installing deps.""" -from typing import Any, Callable, Union, cast +from typing import Any, Callable, cast, overload import numpy as np from juliacall import convert as jl_convert # type: ignore from numpy.typing import NDArray from .deprecated import init_julia, install -from .julia_import import jl +from .julia_import import AnyValue, jl jl_convert = cast(Callable[[Any, Any], Any], jl_convert) @@ -53,7 +53,11 @@ def jl_serialize(obj: Any) -> NDArray[np.uint8]: return np.array(jl.take_b(buf)) -def jl_deserialize(s: Union[NDArray[np.uint8], None]): +@overload +def jl_deserialize(s: NDArray[np.uint8]) -> AnyValue: ... +@overload +def jl_deserialize(s: None) -> None: ... 
+def jl_deserialize(s): if s is None: return s buf = jl.IOBuffer() diff --git a/pysr/julia_import.py b/pysr/julia_import.py index 0e032bee1..4d7b91504 100644 --- a/pysr/julia_import.py +++ b/pysr/julia_import.py @@ -42,6 +42,8 @@ # Deprecated; so just pass to juliacall os.environ["PYTHON_JULIACALL_AUTOLOAD_IPYTHON_EXTENSION"] = autoload_extensions +from juliacall import AnyValue # type: ignore +from juliacall import VectorValue # type: ignore from juliacall import Main as jl # type: ignore jl = cast(ModuleType, jl) diff --git a/pysr/sr.py b/pysr/sr.py index 5b2d7bdf0..a9b5b31cc 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -58,7 +58,7 @@ jl_is_function, jl_serialize, ) -from .julia_import import SymbolicRegression, jl +from .julia_import import AnyValue, SymbolicRegression, VectorValue, jl from .utils import ( ArrayLike, _preprocess_julia_floats, @@ -70,7 +70,11 @@ ALREADY_RAN = False -def _process_constraints(binary_operators, unary_operators, constraints): +def _process_constraints( + binary_operators: List[str], + unary_operators: List[Union[Any, str]], + constraints: Dict[str, Union[int, Tuple[int, int]]], +) -> Dict[str, Union[int, Tuple[int, int]]]: constraints = constraints.copy() for op in unary_operators: if op not in constraints: @@ -88,27 +92,28 @@ def _process_constraints(binary_operators, unary_operators, constraints): "For more tips, please see https://ai.damtp.cam.ac.uk/pysr/tuning/" ) constraints[op] = (-1, -1) + + constraint_tuple = cast(Tuple[int, int], constraints[op]) if op in ["plus", "sub", "+", "-"]: - if constraints[op][0] != constraints[op][1]: + if constraint_tuple[0] != constraint_tuple[1]: raise NotImplementedError( "You need equal constraints on both sides for - and +, " "due to simplification strategies." ) elif op in ["mult", "*"]: # Make sure the complex expression is in the left side. - if constraints[op][0] == -1: + if constraint_tuple[0] == -1: continue - if constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]: - constraints[op][0], constraints[op][1] = ( - constraints[op][1], - constraints[op][0], - ) + if constraint_tuple[1] == -1 or constraint_tuple[0] < constraint_tuple[1]: + constraints[op] = (constraint_tuple[1], constraint_tuple[0]) return constraints def _maybe_create_inline_operators( - binary_operators, unary_operators, extra_sympy_mappings -): + binary_operators: List[str], + unary_operators: List[str], + extra_sympy_mappings: Optional[Dict[str, Callable]], +) -> Tuple[List[str], List[str]]: binary_operators = binary_operators.copy() unary_operators = unary_operators.copy() for op_list in [binary_operators, unary_operators]: @@ -229,7 +234,7 @@ class _DynamicallySetParams: binary_operators: List[str] unary_operators: List[str] maxdepth: int - constraints: Dict[str, str] + constraints: Dict[str, Union[int, Tuple[int, int]]] multithreading: bool batch_size: int update_verbosity: int @@ -1002,7 +1007,7 @@ def __init__( @classmethod def from_file( cls, - equation_file=None, + equation_file: None = None, # Deprecated *, run_directory: str, binary_operators: Optional[List[str]] = None, @@ -1012,7 +1017,7 @@ def from_file( selection_mask: Optional[NDArray[np.bool_]] = None, nout: int = 1, **pysr_kwargs, - ): + ) -> "PySRRegressor": """ Create a model from a saved model checkpoint or equation file. 
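Note: with `run_directory` now the required entry point for `from_file`, a minimal loading sketch
looks like the following (the path shown is hypothetical; it should point at one run's
subdirectory under `output_directory`):

    from pysr import PySRRegressor

    # Load a finished run from its output directory (hypothetical path).
    model = PySRRegressor.from_file(run_directory="outputs/my_run_id")
    print(model.get_best())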
@@ -1065,7 +1070,7 @@ def from_file( assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model = pkl.load(f) + model: "PySRRegressor" = pkl.load(f) # Update any parameters if necessary, such as # extra_sympy_mappings: @@ -1118,7 +1123,7 @@ def from_file( return model - def __repr__(self): + def __repr__(self) -> str: """ Print all current equations fitted by the model. @@ -1165,7 +1170,7 @@ def __repr__(self): output += "]" return output - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: """ Handle pickle serialization for PySRRegressor. @@ -1251,7 +1256,10 @@ def julia_options_(self): @property def julia_state_(self): """The deserialized state.""" - return jl_deserialize(self.julia_state_stream_) + return cast( + Optional[Tuple[VectorValue, AnyValue]], + jl_deserialize(self.julia_state_stream_), + ) @property def raw_julia_state_(self): @@ -1267,7 +1275,9 @@ def raw_julia_state_(self): def expression_spec_(self): return self.expression_spec or ExpressionSpec() - def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]: + def get_best( + self, index: Optional[Union[int, List[int]]] = None + ) -> Union[pd.Series, List[pd.Series]]: """ Get best equation using `model_selection`. @@ -1593,11 +1603,11 @@ def _validate_and_set_fit_params( y_units, ) - def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]: + def _validate_data_X_y(self, X: Any, y: Any) -> Tuple[ndarray, ndarray]: raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore return cast(Tuple[ndarray, ndarray], raw_out) - def _validate_data_X(self, X) -> Tuple[ndarray]: + def _validate_data_X(self, X: Any) -> Tuple[ndarray]: raw_out = self._validate_data(X=X, reset=False) # type: ignore return cast(Tuple[ndarray], raw_out) @@ -1795,13 +1805,13 @@ def _run( extra_sympy_mappings=self.extra_sympy_mappings, ) if constraints is not None: - constraints = _process_constraints( + _constraints = _process_constraints( binary_operators=binary_operators, unary_operators=unary_operators, constraints=constraints, ) - una_constraints = [constraints[op] for op in unary_operators] - bin_constraints = [constraints[op] for op in binary_operators] + una_constraints = [_constraints[op] for op in unary_operators] + bin_constraints = [_constraints[op] for op in binary_operators] else: una_constraints = None bin_constraints = None @@ -2235,7 +2245,13 @@ def refresh(self, run_directory: Optional[str] = None) -> None: check_is_fitted(self, attributes=["run_id_", "output_directory_"]) self.equations_ = self.get_hof() - def predict(self, X, index=None, *, category: Optional[ndarray] = None): + def predict( + self, + X, + index: Optional[Union[int, List[int]]] = None, + *, + category: Optional[ndarray] = None, + ) -> ndarray: """ Predict y from input X using the equation chosen by `model_selection`. @@ -2309,10 +2325,14 @@ def predict(self, X, index=None, *, category: Optional[ndarray] = None): if isinstance(best_equation, list): assert self.nout_ > 1 return np.stack( - [eq["lambda_format"](X, *args) for eq in best_equation], axis=1 + [ + cast(ndarray, eq["lambda_format"](X, *args)) + for eq in best_equation + ], + axis=1, ) else: - return best_equation["lambda_format"](X, *args) + return cast(ndarray, best_equation["lambda_format"](X, *args)) except Exception as error: raise ValueError( "Failed to evaluate the expression. 
" @@ -2322,7 +2342,7 @@ def predict(self, X, index=None, *, category: Optional[ndarray] = None): "You can then run `model.refresh()` to re-load the expressions." ) from error - def sympy(self, index=None): + def sympy(self, index: Optional[Union[int, List[int]]] = None): """ Return sympy representation of the equation(s) chosen by `model_selection`. @@ -2348,7 +2368,9 @@ def sympy(self, index=None): else: return best_equation["sympy_format"] - def latex(self, index=None, precision=3): + def latex( + self, index: Optional[Union[int, List[int]]] = None, precision: int = 3 + ) -> Union[str, List[str]]: """ Return latex representation of the equation(s) chosen by `model_selection`. @@ -2465,7 +2487,7 @@ def get_equation_file(self, i: Optional[int] = None) -> Path: else: return Path(self.output_directory_) / self.run_id_ / "hall_of_fame.csv" - def _read_equation_file(self): + def _read_equation_file(self) -> List[pd.DataFrame]: """Read the hall of fame file created by `SymbolicRegression.jl`.""" try: @@ -2507,7 +2529,9 @@ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: return df - def get_hof(self, search_output: Optional[Any] = None): + def get_hof( + self, search_output: Optional[Any] = None + ) -> Union[pd.DataFrame, List[pd.DataFrame]]: """Get the equations from a hall of fame file or search output. If no arguments entered, the ones used @@ -2550,10 +2574,10 @@ def get_hof(self, search_output: Optional[Any] = None): def latex_table( self, - indices=None, - precision=3, - columns=["equation", "complexity", "loss", "score"], - ): + indices: Optional[List[int]] = None, + precision: int = 3, + columns: List[str] = ["equation", "complexity", "loss", "score"], + ) -> str: """Create a LaTeX/booktabs table for all, or some, of the equations. Parameters From 82eef258d08e3193fa233e175def6a71e9aad575 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 25 Nov 2024 13:21:59 +0000 Subject: [PATCH 42/92] refactor: further improvements to typing --- pysr/expression_specs.py | 21 +++++++++++++-------- pysr/sr.py | 11 ++++++----- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 266779881..1e56880c6 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -1,17 +1,22 @@ import copy from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, NewType, Optional, TypeAlias import numpy as np import pandas as pd from .export import add_export_formats from .julia_helpers import jl_array -from .julia_import import SymbolicRegression, jl +from .julia_import import AnyValue, SymbolicRegression, jl +# For type checking purposes if TYPE_CHECKING: from .sr import PySRRegressor + PySRRegressor: TypeAlias = PySRRegressor +else: + PySRRegressor = NewType("PySRRegressor", Any) + class AbstractExpressionSpec(ABC): """Abstract base class describing expression types. 
@@ -34,19 +39,19 @@ class AbstractExpressionSpec(ABC): """ @abstractmethod - def julia_expression_type(self) -> Any: + def julia_expression_type(self) -> AnyValue: """The expression type""" pass @abstractmethod - def julia_expression_options(self) -> Any: + def julia_expression_options(self) -> AnyValue: """The expression options""" pass @abstractmethod def create_exports( self, - model: "PySRRegressor", + model: PySRRegressor, equations: pd.DataFrame, search_output: Any, ) -> pd.DataFrame: @@ -81,7 +86,7 @@ def julia_expression_options(self): def create_exports( self, - model: "PySRRegressor", + model: PySRRegressor, equations: pd.DataFrame, search_output: Any, ): @@ -189,7 +194,7 @@ def julia_expression_options(self): def create_exports( self, - model: "PySRRegressor", + model: PySRRegressor, equations: pd.DataFrame, search_output: Any, ) -> pd.DataFrame: @@ -211,7 +216,7 @@ def julia_expression_options(self): def create_exports( self, - model: "PySRRegressor", + model: PySRRegressor, equations: pd.DataFrame, search_output: Any, ): diff --git a/pysr/sr.py b/pysr/sr.py index a9b5b31cc..cff837cc1 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -61,6 +61,7 @@ from .julia_import import AnyValue, SymbolicRegression, VectorValue, jl from .utils import ( ArrayLike, + PathLike, _preprocess_julia_floats, _safe_check_feature_names_in, _subscriptify, @@ -1009,7 +1010,7 @@ def from_file( cls, equation_file: None = None, # Deprecated *, - run_directory: str, + run_directory: PathLike, binary_operators: Optional[List[str]] = None, unary_operators: Optional[List[str]] = None, n_features_in: Optional[int] = None, @@ -1468,7 +1469,7 @@ def _validate_and_set_fit_params( Optional[ndarray], Optional[ndarray], ArrayLike[str], - Union[int, float, List[Union[int, float]]], + Optional[Union[int, float, List[Union[int, float]]]], Optional[ArrayLike[str]], Optional[Union[str, ArrayLike[str]]], ]: @@ -1607,9 +1608,9 @@ def _validate_data_X_y(self, X: Any, y: Any) -> Tuple[ndarray, ndarray]: raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore return cast(Tuple[ndarray, ndarray], raw_out) - def _validate_data_X(self, X: Any) -> Tuple[ndarray]: + def _validate_data_X(self, X: Any) -> ndarray: raw_out = self._validate_data(X=X, reset=False) # type: ignore - return cast(Tuple[ndarray], raw_out) + return cast(ndarray, raw_out) def _get_precision_mapped_dtype(self, X: np.ndarray) -> Type: is_complex = np.issubdtype(X.dtype, np.complexfloating) @@ -2225,7 +2226,7 @@ def fit( return self - def refresh(self, run_directory: Optional[str] = None) -> None: + def refresh(self, run_directory: Optional[PathLike] = None) -> None: """ Update self.equations_ with any new options passed. From 78b3b0a7719a39e1e458c94ddcccaceb75817f32 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 26 Nov 2024 12:20:15 +0000 Subject: [PATCH 43/92] refactor: reduce julia compilation --- pysr/expression_specs.py | 6 +++--- pysr/sr.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 1e56880c6..d60aa2532 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -185,12 +185,12 @@ def julia_expression_options(self): else (; num_features...) 
end - return SymbolicRegression.TemplateStructure{tuple_symbol}(combine, num_features) + structure = SymbolicRegression.TemplateStructure{tuple_symbol}(combine, num_features) + return (; structure) end """ ) - structure = creator(self.function_symbols, f_combine, self.num_features) - return jl.seval("NamedTuple{(:structure,)}")((structure,)) + return creator(self.function_symbols, f_combine, self.num_features) def create_exports( self, diff --git a/pysr/sr.py b/pysr/sr.py index cff837cc1..7fb7da013 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1071,7 +1071,7 @@ def from_file( assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model: "PySRRegressor" = pkl.load(f) + model: "pysr.sr.PySRRegressor" = pkl.load(f) # Update any parameters if necessary, such as # extra_sympy_mappings: From 93f9df7fc95c38abed5ebc210dcead99114b5406 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 26 Nov 2024 13:06:33 +0000 Subject: [PATCH 44/92] test: TemplateExpressionSpec for out-of-domain data --- pysr/test/test.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/pysr/test/test.py b/pysr/test/test.py index 89d465bd4..617d0c429 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -12,7 +12,7 @@ import sympy # type: ignore from sklearn.utils.estimator_checks import check_estimator -from pysr import PySRRegressor, install, jl, load_all_packages +from pysr import PySRRegressor, TemplateExpressionSpec, install, jl, load_all_packages from pysr.export_latex import sympy2latex from pysr.feature_selection import _handle_feature_selection, run_feature_selection from pysr.julia_helpers import init_julia @@ -513,6 +513,35 @@ def test_jl_function_error(self): str(cm.exception), ) + def test_template_sin_addition(self): + # Create random data between -1 and 1 + X = self.rstate.uniform(-1, 1, (100, 2)) + + # Ground truth: sin(x + y) + y = np.sin(X[:, 0] + X[:, 1]) + + # Create model with template that includes the missing sin operator + model = PySRRegressor( + expression_spec=TemplateExpressionSpec( + ["f"], "((; f), (x, y)) -> sin(f(x, y))" + ), + binary_operators=["+", "-", "*", "/"], + unary_operators=[], # No sin operator! + maxsize=10, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 3", + **self.default_test_kwargs, + ) + + model.fit(X, y) + + # Test on out of domain data - this should still work due to sin template! + X_test = self.rstate.uniform(2, 10, (25, 2)) + y_test = np.sin(X_test[:, 0] + X_test[:, 1]) + y_pred = model.predict(X_test) + + test_mse = np.mean((y_test - y_pred) ** 2) + self.assertLess(test_mse, 1e-5) + def manually_create_model(equations, feature_names=None): if feature_names is None: From 58b429a2536e21873f9eef72fef5d196e1ba5531 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Tue, 26 Nov 2024 13:10:31 +0000 Subject: [PATCH 45/92] test: clean up TemplateExpressionSpec test --- pysr/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/test/test.py b/pysr/test/test.py index 617d0c429..d48658fcb 100644 --- a/pysr/test/test.py +++ b/pysr/test/test.py @@ -523,7 +523,7 @@ def test_template_sin_addition(self): # Create model with template that includes the missing sin operator model = PySRRegressor( expression_spec=TemplateExpressionSpec( - ["f"], "((; f), (x, y)) -> sin(f(x, y))" + ["f"], "sin_of_f((; f), (x, y)) = sin(f(x, y))" ), binary_operators=["+", "-", "*", "/"], unary_operators=[], # No sin operator! 
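Note: patches 44 and 45 together pin down the template workflow; outside the test harness, the
same usage reads roughly as follows (data and operator choices are illustrative):

    import numpy as np
    from pysr import PySRRegressor, TemplateExpressionSpec

    X = np.random.uniform(-1, 1, (100, 2))
    y = np.sin(X[:, 0] + X[:, 1])

    # The template fixes the outer sin(); the search only needs to
    # discover the inner expression f(x, y) = x + y.
    spec = TemplateExpressionSpec(["f"], "sin_of_f((; f), (x, y)) = sin(f(x, y))")
    model = PySRRegressor(
        expression_spec=spec,
        binary_operators=["+", "-", "*", "/"],
        unary_operators=[],  # no sin operator made available inside f
    )
    model.fit(X, y)
    y_pred = model.predict(X)  # evaluated through the Julia-side expression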
From f433789edb95efd6bd04abb438f273a6808af45d Mon Sep 17 00:00:00 2001
From: MilesCranmer 
Date: Tue, 26 Nov 2024 15:52:27 +0000
Subject: [PATCH 46/92] test: ParametricExpressionSpec pipeline test

---
 pysr/test/test.py | 55 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/pysr/test/test.py b/pysr/test/test.py
index d48658fcb..0904ac90c 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test.py
@@ -12,7 +12,14 @@
 import sympy  # type: ignore
 from sklearn.utils.estimator_checks import check_estimator

-from pysr import PySRRegressor, TemplateExpressionSpec, install, jl, load_all_packages
+from pysr import (
+    ParametricExpressionSpec,
+    PySRRegressor,
+    TemplateExpressionSpec,
+    install,
+    jl,
+    load_all_packages,
+)
 from pysr.export_latex import sympy2latex
 from pysr.feature_selection import _handle_feature_selection, run_feature_selection
 from pysr.julia_helpers import init_julia
@@ -513,6 +520,7 @@ def test_jl_function_error(self):
             str(cm.exception),
         )

-    def test_template_sin_addition(self):
+    def test_template_expressions(self):
         # Create random data between -1 and 1
         X = self.rstate.uniform(-1, 1, (100, 2))

@@ -542,6 +549,50 @@ def test_template_expressions(self):
         test_mse = np.mean((y_test - y_pred) ** 2)
         self.assertLess(test_mse, 1e-5)

+    def test_parametric_expression(self):
+        # Create data with three classes
+        n_points = 100
+        X = self.rstate.uniform(-3, 3, (n_points, 2))  # x1, x2
+        category = self.rstate.randint(0, 3, n_points)  # class (0, 1, or 2)
+
+        # True parameters for each class
+        P1 = [0.1, 1.5, -5.2]  # phase shift for each class
+        P2 = [3.2, 0.5, 1.2]  # offset for each class
+
+        # Ground truth: 2*cos(x2 + P1[class]) + x1^2 - P2[class]
+        y = np.array(
+            [
+                2 * np.cos(x2 + P1[c]) + x1**2 - P2[c]
+                for x1, x2, c in zip(X[:, 0], X[:, 1], category)
+            ]
+        )
+
+        model = PySRRegressor(
+            expression_spec=ParametricExpressionSpec(max_parameters=2),
+            binary_operators=["+", "*", "/", "-"],
+            unary_operators=["cos", "exp"],
+            maxsize=20,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity <= 14",
+            **self.default_test_kwargs,
+        )
+
+        model.fit(X, y, category=category)
+
+        # Test on new data points
+        X_test = self.rstate.uniform(-6, 6, (10, 2))
+        category_test = self.rstate.randint(0, 3, 10)
+
+        y_test = np.array(
+            [
+                2 * np.cos(x2 + P1[c]) + x1**2 - P2[c]
+                for x1, x2, c in zip(X_test[:, 0], X_test[:, 1], category_test)
+            ]
+        )
+
+        y_test_pred = model.predict(X_test, category=category_test)
+        test_mse = np.mean((y_test - y_test_pred) ** 2)
+        self.assertLess(test_mse, 1e-3)
+

 def manually_create_model(equations, feature_names=None):
     if feature_names is None:

From 3fd937d21d0424da0f602cbd013fc6a24586e456 Mon Sep 17 00:00:00 2001
From: MilesCranmer 
Date: Thu, 28 Nov 2024 12:20:18 +0000
Subject: [PATCH 47/92] test: rename to `test_main.py` for pytest compat

---
 pysr/test/__init__.py               | 2 +-
 pysr/test/{test.py => test_main.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/test/__init__.py b/pysr/test/__init__.py
index cb6b9e4a3..4d977cccf 100644
--- a/pysr/test/__init__.py
+++ b/pysr/test/__init__.py
@@ -1,7 +1,7 @@
-from .test import runtests
 from .test_cli import get_runtests as get_runtests_cli
 from .test_dev import runtests as runtests_dev
 from .test_jax import runtests as runtests_jax
+from .test_main import runtests
 from .test_startup import runtests as runtests_startup
 from .test_torch import runtests as runtests_torch

diff --git
a/pysr/test/test.py b/pysr/test/test_main.py similarity index 100% rename from pysr/test/test.py rename to pysr/test/test_main.py From 959dfdbe4b865bc5dd0abdd2f638a83ac3be3d85 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 12:41:26 +0000 Subject: [PATCH 48/92] deps: require Python >= 3.9 --- .github/workflows/CI.yml | 10 +++++----- .github/workflows/CI_docker_large_nightly.yml | 2 +- .github/workflows/CI_large_nightly.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- pysr/expression_specs.py | 7 ++++++- 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 07ece9dae..a454345ae 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,7 +31,7 @@ jobs: test-id: [main] include: - julia-version: '1.10' - python-version: '3.8' + python-version: '3.9' os: ubuntu-latest test-id: include - julia-version: '1' @@ -176,7 +176,7 @@ jobs: matrix: python-version: - '3.12' - - '3.8' + - '3.9' os: ['ubuntu-latest'] steps: @@ -193,10 +193,10 @@ jobs: pip install mypy - name: "Install additional dependencies" run: python -m pip install jax jaxlib torch - if: ${{ matrix.python-version != '3.8' }} + if: ${{ matrix.python-version != '3.9' }} - name: "Run mypy" run: python -m mypy --install-types --non-interactive pysr - if: ${{ matrix.python-version != '3.8' }} + if: ${{ matrix.python-version != '3.9' }} - name: "Run compatible mypy" run: python -m mypy --ignore-missing-imports pysr - if: ${{ matrix.python-version == '3.8' }} + if: ${{ matrix.python-version == '3.9' }} diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml index 185383ccf..35d15fbaa 100644 --- a/.github/workflows/CI_docker_large_nightly.yml +++ b/.github/workflows/CI_docker_large_nightly.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: julia-version: ['1.6', '1'] - python-version: ['3.8', '3.12'] + python-version: ['3.9', '3.12'] os: [ubuntu-latest] arch: ['linux/amd64', 'linux/arm64'] diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index cbd9a7ef3..af00eb8c4 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: julia-version: ['1.6', '1.8', '1.10'] - python-version: ['3.8', '3.10', '3.12'] + python-version: ['3.9', '3.10', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: diff --git a/environment.yml b/environment.yml index c7d6ceebf..f3d6c42de 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: test channels: - conda-forge dependencies: - - python>=3.8 + - python>=3.9 - sympy>=1.0.0,<2.0.0 - pandas>=0.21.0,<3.0.0 - numpy>=1.13.0,<2.0.0 diff --git a/pyproject.toml b/pyproject.toml index 62101f0c5..e7f247546 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ description = "Simple and efficient symbolic regression" readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index d60aa2532..20ccf9ff0 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -1,6 +1,11 @@ import copy from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, NewType, Optional, TypeAlias +from typing import 
TYPE_CHECKING, Any, Dict, List, NewType, Optional + +try: + from typing import TypeAlias +except ImportError: + TypeAlias = Any import numpy as np import pandas as pd From 2e75f4ea8ee818f64a8957ad6a954f97caf00824 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 12:42:47 +0000 Subject: [PATCH 49/92] test: require pytest in dev deps --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e7f247546..c8d181682 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,4 +42,6 @@ dev-dependencies = [ "types-pytz>=2024.1.0.20240417", "types-openpyxl>=3.1.0.20240428", "coverage>=7.5.3", + "pytest>=8.3.3", + "nbval>=0.11.0", ] From 62865b6f55eedaa096d39f3b5f2e025b8f688bcb Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 12:52:46 +0000 Subject: [PATCH 50/92] fix: TypeAlias definition on older Python --- pysr/expression_specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 20ccf9ff0..c7ac5327a 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -5,7 +5,7 @@ try: from typing import TypeAlias except ImportError: - TypeAlias = Any + TypeAlias = NewType("TypeAlias", Any) import numpy as np import pandas as pd From 713d3c62bd2a0c71919d34d170b4c6ee526a5de5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 13:00:46 +0000 Subject: [PATCH 51/92] fix: only convert dtype for Julia evaluations --- pysr/expression_specs.py | 12 ++++++++++++ pysr/sr.py | 5 ++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index c7ac5327a..a4c771897 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -63,6 +63,10 @@ def create_exports( """Create additional columns in the equations dataframe.""" pass + @property + def evaluates_in_julia(self) -> bool: + return False + @property def supports_sympy(self) -> bool: return False @@ -197,6 +201,10 @@ def julia_expression_options(self): ) return creator(self.function_symbols, f_combine, self.num_features) + @property + def evaluates_in_julia(self): + return True + def create_exports( self, model: PySRRegressor, @@ -219,6 +227,10 @@ def julia_expression_type(self): def julia_expression_options(self): return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,)) + @property + def evaluates_in_julia(self): + return True + def create_exports( self, model: PySRRegressor, diff --git a/pysr/sr.py b/pysr/sr.py index 7fb7da013..e644d8f5a 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2313,7 +2313,10 @@ def predict( # feature selected) X in fit. 
X = X.reindex(columns=self.feature_names_in_) X = self._validate_data_X(X) - X = X.astype(self._get_precision_mapped_dtype(X)) + if self.expression_spec_.evaluates_in_julia: + # Julia wants the right dtype + X = X.astype(self._get_precision_mapped_dtype(X)) + if category is not None: offset_for_julia_indexing = 1 args: tuple = ( From baa953d2096040b802ddc88582e5ad1dbc5d46ca Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 13:46:58 +0000 Subject: [PATCH 52/92] deps: add back typing-extensions --- environment.yml | 1 + pysr/expression_specs.py | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index f3d6c42de..840b6a1c3 100644 --- a/environment.yml +++ b/environment.yml @@ -9,3 +9,4 @@ dependencies: - scikit-learn>=1.0.0,<2.0.0 - pyjuliacall>=0.9.21,<0.9.22 - click>=7.0.0,<9.0.0 + - typing-extensions>=4.0.0,<5.0.0 diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index a4c771897..858bcf73f 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -5,7 +5,7 @@ try: from typing import TypeAlias except ImportError: - TypeAlias = NewType("TypeAlias", Any) + from typing_extensions import TypeAlias import numpy as np import pandas as pd diff --git a/requirements.txt b/requirements.txt index aa92aaf13..96f4ed9a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ scikit_learn>=1.0.0,<2.0.0 juliacall==0.9.23 click>=7.0.0,<9.0.0 setuptools>=50.0.0 +typing-extensions>=4.0.0,<5.0.0 From b2fe574a9752092982aaad13a9550bab3ceaf859 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 14:14:55 +0000 Subject: [PATCH 53/92] test: fix mypy errors --- pysr/expression_specs.py | 5 +++-- pysr/sr.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 858bcf73f..473cdad89 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -1,10 +1,11 @@ import copy +import sys from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Dict, List, NewType, Optional -try: +if sys.version_info >= (3, 10): from typing import TypeAlias -except ImportError: +else: from typing_extensions import TypeAlias import numpy as np diff --git a/pysr/sr.py b/pysr/sr.py index e644d8f5a..df11cca58 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1071,7 +1071,7 @@ def from_file( assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model: "pysr.sr.PySRRegressor" = pkl.load(f) + model: "PySRRegressor" = pkl.load(f) # Update any parameters if necessary, such as # extra_sympy_mappings: From c2cbbfc2e281c35d2f45a6eed3b51b4a96632fd4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 16:11:13 +0000 Subject: [PATCH 54/92] test: improve coverage for new 1.0 code --- pysr/expression_specs.py | 12 +++++------- pysr/sr.py | 6 ++++++ pysr/test/test_main.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index 473cdad89..eab85a6ec 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -17,9 +17,9 @@ # For type checking purposes if TYPE_CHECKING: - from .sr import PySRRegressor + from .sr import PySRRegressor # pragma: no cover - PySRRegressor: TypeAlias = PySRRegressor + PySRRegressor: TypeAlias = PySRRegressor # pragma: no cover else: PySRRegressor = NewType("PySRRegressor", Any) @@ -47,12 +47,12 @@ class 
AbstractExpressionSpec(ABC): @abstractmethod def julia_expression_type(self) -> AnyValue: """The expression type""" - pass + pass # pragma: no cover @abstractmethod def julia_expression_options(self) -> AnyValue: """The expression options""" - pass + pass # pragma: no cover @abstractmethod def create_exports( @@ -62,7 +62,7 @@ def create_exports( search_output: Any, ) -> pd.DataFrame: """Create additional columns in the equations dataframe.""" - pass + pass # pragma: no cover @property def evaluates_in_julia(self) -> bool: @@ -247,8 +247,6 @@ def __init__(self, expression): self.expression = expression def __call__(self, X: np.ndarray, *args): - if not isinstance(X, np.ndarray): - raise ValueError("X must be a numpy array") raw_output = self.expression(jl_array(X.T), *args) return np.array(raw_output).T diff --git a/pysr/sr.py b/pysr/sr.py index df11cca58..0cdc4981f 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2364,6 +2364,10 @@ def sympy(self, index: Optional[Union[int, List[int]]] = None): best_equation : str, list[str] of length nout_ SymPy representation of the best equation. """ + if not self.expression_spec_.supports_sympy: + raise ValueError( + f"`expression_spec={self.expression_spec_}` does not support sympy export." + ) self.refresh() best_equation = self.get_best(index=index) if isinstance(best_equation, list): @@ -2558,6 +2562,8 @@ def get_hof( if should_read_from_file: self.equation_file_contents_ = self._read_equation_file() + _validate_export_mappings(self.extra_jax_mappings, self.extra_torch_mappings) + equation_file_contents = cast(List[pd.DataFrame], self.equation_file_contents_) ret_outputs = [ diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index 0904ac90c..4f7685ff8 100644 --- a/pysr/test/test_main.py +++ b/pysr/test/test_main.py @@ -549,6 +549,23 @@ def test_template_expressions(self): test_mse = np.mean((y_test - y_pred) ** 2) self.assertLess(test_mse, 1e-5) + # Make sure that a nice error is raised if we try to get the sympy expression: + # f"`expression_spec={self.expression_spec_}` does not support sympy export." 
+ with self.assertRaises(ValueError) as cm: + model.sympy() + self.assertRegex( + str(cm.exception), + r"`expression_spec=.*TemplateExpressionSpec.*` does not support sympy export.", + ) + with self.assertRaises(ValueError): + model.latex() + with self.assertRaises(ValueError): + model.jax() + with self.assertRaises(ValueError): + model.pytorch() + with self.assertRaises(ValueError): + model.latex_table() + def test_parametric_expression(self): # Create data with two classes n_points = 100 @@ -593,6 +610,17 @@ def test_parametric_expression(self): test_mse = np.mean((y_test - y_test_pred) ** 2) self.assertLess(test_mse, 1e-3) + with self.assertRaises(ValueError): + model.sympy() + with self.assertRaises(ValueError): + model.latex() + with self.assertRaises(ValueError): + model.jax() + with self.assertRaises(ValueError): + model.pytorch() + with self.assertRaises(ValueError): + model.latex_table() + def manually_create_model(equations, feature_names=None): if feature_names is None: @@ -841,6 +869,14 @@ def test_deprecation(self): # The correct value should be set: self.assertEqual(model.fraction_replaced, 0.2) + with self.assertRaises(NotImplementedError): + model.equation_file_ + + with self.assertRaises(ValueError) as cm: + PySRRegressor.from_file(equation_file="", run_directory="") + + self.assertIn("Passing `equation_file` is deprecated", str(cm.exception)) + def test_deprecated_functions(self): with self.assertWarns(FutureWarning): install() From 9fabec6540c1aa43cd8566aa72545531982eb7a0 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 17:38:24 +0000 Subject: [PATCH 55/92] feat: get complexity_mapping working --- pysr/sr.py | 8 ++++++-- pysr/test/test_main.py | 15 +++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 0cdc4981f..d703ef98c 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -786,7 +786,7 @@ def __init__( elementwise_loss: Optional[str] = None, loss_function: Optional[str] = None, complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None, - complexity_of_constants: Union[int, float] = 1, + complexity_of_constants: Optional[Union[int, float]] = None, complexity_of_variables: Optional[Union[int, float]] = None, complexity_mapping: Optional[str] = None, parsimony: float = 0.0032, @@ -1889,6 +1889,10 @@ def _run( ) output_list.append(jl_op) + complexity_mapping = ( + jl.seval(self.complexity_mapping) if self.complexity_mapping else None + ) + # Call to Julia backend. 
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl options = SymbolicRegression.Options( @@ -1899,7 +1903,7 @@ def _run( complexity_of_operators=complexity_of_operators, complexity_of_constants=self.complexity_of_constants, complexity_of_variables=complexity_of_variables, - complexity_mapping=self.complexity_mapping, + complexity_mapping=complexity_mapping, expression_type=self.expression_spec_.julia_expression_type(), expression_options=self.expression_spec_.julia_expression_options(), nested_constraints=nested_constraints, diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index 4f7685ff8..ac94b8ea7 100644 --- a/pysr/test/test_main.py +++ b/pysr/test/test_main.py @@ -520,7 +520,7 @@ def test_jl_function_error(self): str(cm.exception), ) - def test_template_expressions(self): + def test_template_expressions_and_custom_complexity(self): # Create random data between -1 and 1 X = self.rstate.uniform(-1, 1, (100, 2)) @@ -535,7 +535,9 @@ def test_template_expressions(self): binary_operators=["+", "-", "*", "/"], unary_operators=[], # No sin operator! maxsize=10, - early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 3", + early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 6", + # Custom complexity *function*: + complexity_mapping="my_complexity(ex) = sum(t -> 2, get_tree(ex))", **self.default_test_kwargs, ) @@ -549,6 +551,15 @@ def test_template_expressions(self): test_mse = np.mean((y_test - y_pred) ** 2) self.assertLess(test_mse, 1e-5) + # Check there is a row with complexity 6 and MSE < 1e-10 + df = model.equations_ + good_rows = df[(df.complexity == 6) & (df.loss < 1e-10)] + self.assertGreater(len(good_rows), 0) + + # Check there are NO rows with lower complexity and MSE < 1e-10 + simpler_good_rows = df[(df.complexity < 6) & (df.loss < 1e-10)] + self.assertEqual(len(simpler_good_rows), 0) + # Make sure that a nice error is raised if we try to get the sympy expression: # f"`expression_spec={self.expression_spec_}` does not support sympy export." with self.assertRaises(ValueError) as cm: From 0b4f6d41087346665febd6adb3437b261ea4e69b Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 18:07:08 +0000 Subject: [PATCH 56/92] docs: push to new url --- .github/workflows/docs.yml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index af36db783..c277815c3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,7 +25,7 @@ jobs: - name: "Set up Python" uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.12 cache: pip - name: "Install packages for docs building" run: pip install -r docs/requirements.txt @@ -33,5 +33,18 @@ jobs: run: pip install . 
&& python -c 'import pysr'
       - name: "Build API docs"
         run: cd docs && ./gen_docs.sh
-      - name: "Deploy documentation"
+      - name: "Deploy documentation to primary repository"
         run: mkdocs gh-deploy --force
+      - name: "Deploy documentation to secondary repository"
+        env:
+          DEPLOY_KEY: ${{ secrets.DAMTP_DEPLOY_KEY }}
+        run: |
+          # Set up SSH key for authentication
+          mkdir -p ~/.ssh
+          echo "$DEPLOY_KEY" > ~/.ssh/id_rsa
+          chmod 600 ~/.ssh/id_rsa
+          ssh-keyscan github.com >> ~/.ssh/known_hosts
+
+          git checkout gh-pages
+          git remote add secondary git@github.com:ai-damtp-cam-ac-uk/pysr.git
+          git push secondary gh-pages --force

From a12c12ecc88dc71e732234617824d3cc633fea44 Mon Sep 17 00:00:00 2001
From: MilesCranmer 
Date: Thu, 28 Nov 2024 18:14:05 +0000
Subject: [PATCH 57/92] deps: update backend

---
 pysr/juliapkg.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 0ae4955e0..35405a8ed 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -3,7 +3,7 @@
     "packages": {
         "SymbolicRegression": {
             "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-            "version": "=1.0.2"
+            "version": "=1.0.3"
         },
         "Serialization": {
             "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

From 238cdfb511ff30d1303509d066860b2fe247e562 Mon Sep 17 00:00:00 2001
From: MilesCranmer 
Date: Thu, 28 Nov 2024 19:00:49 +0000
Subject: [PATCH 58/92] feat: add missing hyperparams

---
 pysr/param_groupings.yml |  3 +++
 pysr/sr.py               | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index bce623a7a..a564d6bb7 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -37,6 +37,7 @@
     - weight_mutate_constant
     - weight_mutate_operator
     - weight_swap_operands
+    - weight_rotate_tree
     - weight_randomize
     - weight_simplify
     - weight_optimize
@@ -44,6 +45,7 @@
     - annealing
     - alpha
     - perturbation_factor
+    - probability_negate_constant
     - skip_mutation_failures
   - Tournament Selection:
     - tournament_selection_n
@@ -51,6 +53,7 @@
   - Constant Optimization:
     - optimizer_algorithm
     - optimizer_nrestarts
+    - optimizer_f_calls_limit
     - optimize_probability
     - optimizer_iterations
     - should_optimize_constants

diff --git a/pysr/sr.py b/pysr/sr.py
index d703ef98c..84d4349d8 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -472,6 +472,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     weight_swap_operands : float
         Relative likelihood for swapping operands in binary operators.
         Default is `0.1`.
+    weight_rotate_tree : float
+        How often to perform a tree rotation at a random node.
+        Default is `1.42`
     weight_randomize : float
         Relative likelihood for mutation to completely delete and then
         randomly generate the equation
         Default is `0.00023`.
@@ -513,6 +516,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Number of times to restart the constants optimization process with
         different initial conditions.
         Default is `2`.
+    optimizer_f_calls_limit : int
+        How many function calls to allow during optimization.
+        Default is `10_000`.
     optimize_probability : float
         Probability of optimizing the constants during a single iteration of
         the evolutionary algorithm.
         Default is `0.14`.
@@ -525,6 +531,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         (perturbation_factor*T + 1). Either multiplied by this
         or divided by this.
         Default is `0.076`.
+    probability_negate_constant : float
+        Probability of negating a constant in the equation when mutating it.
+        Default is `0.00743`.
tournament_selection_n : int Number of expressions to consider in each tournament. Default is `10`. @@ -808,6 +817,7 @@ def __init__( weight_mutate_constant: float = 0.048, weight_mutate_operator: float = 0.47, weight_swap_operands: float = 0.1, + weight_rotate_tree: float = 1.42, weight_randomize: float = 0.00023, weight_simplify: float = 0.0020, weight_optimize: float = 0.0, @@ -820,9 +830,11 @@ def __init__( should_optimize_constants: bool = True, optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS", optimizer_nrestarts: int = 2, + optimizer_f_calls_limit: Optional[int] = None, optimize_probability: float = 0.14, optimizer_iterations: int = 8, perturbation_factor: float = 0.076, + probability_negate_constant: float = 0.00743, tournament_selection_n: int = 10, tournament_selection_p: float = 0.86, procs: int = cpu_count(), @@ -905,6 +917,7 @@ def __init__( self.weight_mutate_constant = weight_mutate_constant self.weight_mutate_operator = weight_mutate_operator self.weight_swap_operands = weight_swap_operands + self.weight_rotate_tree = weight_rotate_tree self.weight_randomize = weight_randomize self.weight_simplify = weight_simplify self.weight_optimize = weight_optimize @@ -920,9 +933,11 @@ def __init__( self.should_optimize_constants = should_optimize_constants self.optimizer_algorithm = optimizer_algorithm self.optimizer_nrestarts = optimizer_nrestarts + self.optimizer_f_calls_limit = optimizer_f_calls_limit self.optimize_probability = optimize_probability self.optimizer_iterations = optimizer_iterations self.perturbation_factor = perturbation_factor + self.probability_negate_constant = probability_negate_constant # -- Selection parameters self.tournament_selection_n = tournament_selection_n self.tournament_selection_p = tournament_selection_p @@ -1866,6 +1881,7 @@ def _run( mutate_constant=self.weight_mutate_constant, mutate_operator=self.weight_mutate_operator, swap_operands=self.weight_swap_operands, + rotate_tree=self.weight_rotate_tree, add_node=self.weight_add_node, insert_node=self.weight_insert_node, delete_node=self.weight_delete_node, @@ -1945,9 +1961,11 @@ def _run( print_precision=self.print_precision, optimizer_algorithm=self.optimizer_algorithm, optimizer_nrestarts=self.optimizer_nrestarts, + optimizer_f_calls_limit=self.optimizer_f_calls_limit, optimizer_probability=self.optimize_probability, optimizer_iterations=self.optimizer_iterations, perturbation_factor=self.perturbation_factor, + probability_negate_constant=self.probability_negate_constant, annealing=self.annealing, timeout_in_seconds=self.timeout_in_seconds, crossover_probability=self.crossover_probability, From cc0728cea86f5537a500feee9aeff559ca402933 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 19:20:40 +0000 Subject: [PATCH 59/92] feat!: update all hyperparameters --- pysr/sr.py | 96 +++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 84d4349d8..aa57bf73e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -292,18 +292,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Number of iterations of the algorithm to run. The best equations are printed and migrate between populations at the end of each iteration. - Default is `40`. + Default is `100`. populations : int Number of populations running. - Default is `15`. + Default is `31`. population_size : int Number of individuals in each population. - Default is `33`. + Default is `27`. 
max_evals : int
        Limits the total number of evaluations of expressions to this number.
        Default is `None`.
    maxsize : int
-        Max complexity of an equation. Default is `20`.
+        Max complexity of an equation. Default is `30`.
    maxdepth : int
        Max depth of an equation. You can use both `maxsize` and `maxdepth`.
        `maxdepth` is by default not used.
@@ -404,7 +404,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
        Default is `None`.
    parsimony : float
        Multiplicative factor for how much to punish complexity.
-        Default is `0.0032`.
+        Default is `0.0`.
    dimensional_constraint_penalty : float
        Additive penalty for if dimensional analysis of an expression fails.
        By default, this is `1000.0`.
@@ -426,11 +426,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
        weight the contribution. If you find that the search is only optimizing
        the most complex expressions while the simpler expressions remain stagnant,
        you should increase this value.
-        Default is `20.0`.
+        Default is `1040.0`.
    alpha : float
        Initial temperature for simulated annealing
        (requires `annealing` to be `True`).
-        Default is `0.1`.
+        Default is `3.17`.
    annealing : bool
        Whether to use annealing. Default is `False`.
    early_stop_condition : float | str
@@ -442,46 +442,46 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    ncycles_per_iteration : int
        Number of total mutations to run, per 10 samples of the population,
        per iteration.
-        Default is `550`.
+        Default is `380`.
    fraction_replaced : float
        How much of population to replace with migrating equations from
        other populations.
-        Default is `0.000364`.
+        Default is `0.00036`.
    fraction_replaced_hof : float
        How much of population to replace with migrating equations from
-        hall of fame. Default is `0.035`.
+        hall of fame. Default is `0.0614`.
    weight_add_node : float
        Relative likelihood for mutation to add a node.
-        Default is `0.79`.
+        Default is `2.47`.
    weight_insert_node : float
        Relative likelihood for mutation to insert a node.
-        Default is `5.1`.
+        Default is `0.0112`.
    weight_delete_node : float
        Relative likelihood for mutation to delete a node.
-        Default is `1.7`.
+        Default is `0.870`.
    weight_do_nothing : float
        Relative likelihood for mutation to leave the individual.
-        Default is `0.21`.
+        Default is `0.273`.
    weight_mutate_constant : float
        Relative likelihood for mutation to change the constant slightly
        in a random direction.
-        Default is `0.048`.
+        Default is `0.0346`.
    weight_mutate_operator : float
        Relative likelihood for mutation to swap an operator.
-        Default is `0.47`.
+        Default is `0.293`.
    weight_swap_operands : float
        Relative likelihood for swapping operands in binary operators.
-        Default is `0.1`.
+        Default is `0.198`.
    weight_rotate_tree : float
        How often to perform a tree rotation at a random node.
-        Default is `1.42`
+        Default is `4.26`.
    weight_randomize : float
        Relative likelihood for mutation to completely delete and then
        randomly generate the equation
-        Default is `0.00023`.
+        Default is `0.000502`.
    weight_simplify : float
        Relative likelihood for mutation to simplify constant parts by evaluation
-        Default is `0.0020`.
+        Default is `0.00209`.
    weight_optimize: float
        Constant optimization can also be performed as a mutation, in addition to
        the normal strategy controlled by `optimize_probability` which happens
@@ -490,7 +490,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
        Default is `0.0`.
    crossover_probability : float
        Absolute probability of crossover-type genetic operation, instead of a mutation.
- Default is `0.066`. + Default is `0.0259`. skip_mutation_failures : bool Whether to skip mutation and crossover failures, rather than simply re-sampling the current member. @@ -530,18 +530,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Constants are perturbed by a max factor of (perturbation_factor*T + 1). Either multiplied by this or divided by this. - Default is `0.076`. + Default is `0.129`. probability_negate_constant : float Probability of negating a constant in the equation when mutating it. Default is `0.00743`. tournament_selection_n : int Number of expressions to consider in each tournament. - Default is `10`. + Default is `15`. tournament_selection_p : float Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. - Default is `0.86`. + Default is `0.982`. procs : int Number of processes (=number of populations running). Default is `cpu_count()`. @@ -782,11 +782,11 @@ def __init__( binary_operators: Optional[List[str]] = None, unary_operators: Optional[List[str]] = None, expression_spec: Optional[AbstractExpressionSpec] = None, - niterations: int = 40, - populations: int = 15, - population_size: int = 33, + niterations: int = 100, + populations: int = 31, + population_size: int = 27, max_evals: Optional[int] = None, - maxsize: int = 20, + maxsize: int = 30, maxdepth: Optional[int] = None, warmup_maxsize_by: Optional[float] = None, timeout_in_seconds: Optional[float] = None, @@ -798,30 +798,30 @@ def __init__( complexity_of_constants: Optional[Union[int, float]] = None, complexity_of_variables: Optional[Union[int, float]] = None, complexity_mapping: Optional[str] = None, - parsimony: float = 0.0032, + parsimony: float = 0.0, dimensional_constraint_penalty: Optional[float] = None, dimensionless_constants_only: bool = False, use_frequency: bool = True, use_frequency_in_tournament: bool = True, - adaptive_parsimony_scaling: float = 20.0, - alpha: float = 0.1, + adaptive_parsimony_scaling: float = 1040.0, + alpha: float = 3.17, annealing: bool = False, early_stop_condition: Optional[Union[float, str]] = None, - ncycles_per_iteration: int = 550, - fraction_replaced: float = 0.000364, - fraction_replaced_hof: float = 0.035, - weight_add_node: float = 0.79, - weight_insert_node: float = 5.1, - weight_delete_node: float = 1.7, - weight_do_nothing: float = 0.21, - weight_mutate_constant: float = 0.048, - weight_mutate_operator: float = 0.47, - weight_swap_operands: float = 0.1, - weight_rotate_tree: float = 1.42, - weight_randomize: float = 0.00023, - weight_simplify: float = 0.0020, + ncycles_per_iteration: int = 380, + fraction_replaced: float = 0.00036, + fraction_replaced_hof: float = 0.0614, + weight_add_node: float = 2.47, + weight_insert_node: float = 0.0112, + weight_delete_node: float = 0.870, + weight_do_nothing: float = 0.273, + weight_mutate_constant: float = 0.0346, + weight_mutate_operator: float = 0.293, + weight_swap_operands: float = 0.198, + weight_rotate_tree: float = 4.26, + weight_randomize: float = 0.000502, + weight_simplify: float = 0.00209, weight_optimize: float = 0.0, - crossover_probability: float = 0.066, + crossover_probability: float = 0.0259, skip_mutation_failures: bool = True, migration: bool = True, hof_migration: bool = True, @@ -833,10 +833,10 @@ def __init__( optimizer_f_calls_limit: Optional[int] = None, optimize_probability: float = 0.14, optimizer_iterations: int = 8, - perturbation_factor: float = 0.076, + perturbation_factor: float 
= 0.129, probability_negate_constant: float = 0.00743, - tournament_selection_n: int = 10, - tournament_selection_p: float = 0.86, + tournament_selection_n: int = 15, + tournament_selection_p: float = 0.982, procs: int = cpu_count(), multithreading: Optional[bool] = None, cluster_manager: Optional[ From 878d2da770567e0978b98215c192289db6dc7b3e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 20:21:05 +0000 Subject: [PATCH 60/92] feat!: introduce `parallelism` parameter instead of `multithreading` --- pysr/param_groupings.yml | 2 +- pysr/sr.py | 155 ++++++++++++++++++++++++++++----------- 2 files changed, 114 insertions(+), 43 deletions(-) diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index a564d6bb7..98ebee250 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -71,8 +71,8 @@ - timeout_in_seconds - early_stop_condition - Performance and Parallelization: + - parallelism - procs - - multithreading - cluster_manager - heap_size_hint_in_bytes - batching diff --git a/pysr/sr.py b/pysr/sr.py index aa57bf73e..454991dc3 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -236,7 +236,6 @@ class _DynamicallySetParams: unary_operators: List[str] maxdepth: int constraints: Dict[str, Union[int, Tuple[int, int]]] - multithreading: bool batch_size: int update_verbosity: int progress: bool @@ -542,12 +541,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. Default is `0.982`. - procs : int - Number of processes (=number of populations running). - Default is `cpu_count()`. - multithreading : bool - Use multithreading instead of distributed backend. - Using procs=0 will turn off both. Default is `True`. + parallelism: Optional[Literal["serial", "multithreading", "multiprocessing"]] + Parallelism to use for the search. Can be `"serial"`, `"multithreading"`, or `"multiprocessing"`. + Default is `"multithreading"`. + procs: Optional[int] + Number of processes to use for parallelism. If `None`, defaults to `cpu_count()`. + Default is `None`. cluster_manager : str For distributed computing, this sets the job queue system. Set to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or @@ -599,7 +598,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): deterministic : bool Make a PySR search give the same result every run. To use this, you must turn off parallelism - (with `procs`=0, `multithreading`=False), + (with `parallelism="serial"`), and set `random_state` to a fixed seed. Default is `False`. 
warm_start : bool @@ -837,8 +836,10 @@ def __init__( probability_negate_constant: float = 0.00743, tournament_selection_n: int = 15, tournament_selection_p: float = 0.982, - procs: int = cpu_count(), - multithreading: Optional[bool] = None, + parallelism: Optional[ + Literal["serial", "multithreading", "multiprocessing"] + ] = None, + procs: Optional[int] = None, cluster_manager: Optional[ Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] ] = None, @@ -870,6 +871,8 @@ def __init__( extra_jax_mappings: Optional[Dict[Callable, str]] = None, denoise: bool = False, select_k_features: Optional[int] = None, + # Deprecated parameters: + multithreading: Optional[bool] = None, **kwargs, ): # Hyperparameters @@ -942,8 +945,8 @@ def __init__( self.tournament_selection_n = tournament_selection_n self.tournament_selection_p = tournament_selection_p # -- Performance parameters + self.parallelism = parallelism self.procs = procs - self.multithreading = multithreading self.cluster_manager = cluster_manager self.heap_size_hint_in_bytes = heap_size_hint_in_bytes self.batching = batching @@ -978,6 +981,9 @@ def __init__( self.denoise = denoise self.select_k_features = select_k_features + # Deprecated but still supported parameters + self.multithreading = multithreading + # Once all valid parameters have been assigned handle the # deprecated kwargs if len(kwargs) > 0: # pragma: no cover @@ -1408,24 +1414,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams: elif self.maxsize < 7: raise ValueError("PySR requires a maxsize of at least 7") - if self.deterministic and not ( - self.multithreading in [False, None] - and self.procs == 0 - and self.random_state is not None - ): - raise ValueError( - "To ensure deterministic searches, you must set `random_state` to a seed, " - "`procs` to `0`, and `multithreading` to `False` or `None`." - ) - - if self.random_state is not None and ( - not self.deterministic or self.procs != 0 - ): - warnings.warn( - "Note: Setting `random_state` without also setting `deterministic` " - "to True and `procs` to 0 will result in non-deterministic searches. " - ) - if self.elementwise_loss is not None and self.loss_function is not None: raise ValueError( "You cannot set both `elementwise_loss` and `loss_function`." @@ -1442,7 +1430,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams: unary_operators=[], maxdepth=self.maxsize, constraints={}, - multithreading=self.procs != 0 and self.cluster_manager is None, batch_size=1, update_verbosity=int(self.verbosity), progress=self.progress, @@ -1811,7 +1798,21 @@ def _run( if not ALREADY_RAN and runtime_params.update_verbosity != 0: print("Compiling Julia backend...") + parallelism, numprocs = _map_parallelism_params( + self.parallelism, self.procs, self.multithreading + ) + + if self.deterministic and parallelism != "serial": + raise ValueError( + "To ensure deterministic searches, you must set `parallelism='serial'`. " + "Additionally, make sure to set `random_state` to a seed." + ) + if cluster_manager is not None: + if parallelism != "multiprocessing": + raise ValueError( + "To use cluster managers, you must set `parallelism='multiprocessing'`." + ) cluster_manager = _load_cluster_manager(cluster_manager) # TODO(mcranmer): These functions should be part of this class. 
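For orientation, a hedged sketch of how the checks added above surface at the user level (values are illustrative; both errors are raised once `fit` reaches `_run`):

    from pysr import PySRRegressor

    # OK: deterministic searches must be serial, with a fixed seed
    model = PySRRegressor(deterministic=True, parallelism="serial", random_state=0)

    # OK: cluster managers now require explicit multiprocessing
    model = PySRRegressor(
        parallelism="multiprocessing", procs=8, cluster_manager="slurm"
    )

    # ValueError at fit time: deterministic=True under the default
    # parallelism="multithreading" is rejected by the first check above
    # model = PySRRegressor(deterministic=True); model.fit(X, y)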
@@ -2006,17 +2007,6 @@ def _run( else: jl_extra = jl.NamedTuple() - if self.procs == 0 and not runtime_params.multithreading: - parallelism = "serial" - elif runtime_params.multithreading: - parallelism = "multithreading" - else: - parallelism = "multiprocessing" - - cprocs = ( - None if parallelism in ["serial", "multithreading"] else int(self.procs) - ) - if len(y.shape) > 1: # We set these manually so that they respect Python's 0 indexing # (by default Julia will use y1, y2...) @@ -2045,7 +2035,7 @@ def _run( else self.y_units_ ), options=options, - numprocs=cprocs, + numprocs=numprocs, parallelism=parallelism, saved_state=self.julia_state_, return_state=True, @@ -2740,3 +2730,84 @@ def _mutate_parameter(param_name: str, param_value): return False return param_value + + +def _map_parallelism_params( + parallelism: Optional[Literal["serial", "multithreading", "multiprocessing"]], + procs: Optional[int], + multithreading: Optional[bool], +) -> Tuple[Literal["serial", "multithreading", "multiprocessing"], Optional[int]]: + """Map old and new parallelism parameters to the new format. + + Parameters + ---------- + parallelism : str or None + New parallelism parameter. Can be "serial", "multithreading", or "multiprocessing". + procs : int or None + Number of processes parameter. + multithreading : bool or None + Old multithreading parameter. + + Returns + ------- + parallelism : str + Mapped parallelism mode. + procs : int or None + Mapped number of processes. + + Raises + ------ + ValueError + If both old and new parameters are specified, or if invalid combinations are given. + """ + # Check for mixing old and new parameters + using_new = parallelism is not None + using_old = multithreading is not None + + if using_new and using_old: + raise ValueError( + "Cannot mix old and new parallelism parameters. " + "Use either `parallelism` and `procs`, or the deprecated `multithreading` and `procs`." + ) + elif using_old: + warnings.warn( + "The `multithreading: bool` parameter has been deprecated in favor " + "of `parallelism: Literal['multithreading', 'serial', 'multiprocessing']`.\n" + "Previous usage of `multithreading=True` (default) is now `parallelism='multithreading'`; " + "`multithreading=False, procs=0` is now `parallelism='serial'`; and " + "`multithreading=False, procs={int}` is now `parallelism='multiprocessing', procs={int}`."
+ ) + if multithreading: + _parallelism = "multithreading" + _procs = None + elif procs is not None and procs > 0: + _parallelism = "multiprocessing" + _procs = procs + else: + _parallelism = "serial" + _procs = None + elif using_new: + _parallelism = parallelism + _procs = procs + else: + _parallelism = "multithreading" + _procs = None + + if _parallelism not in {"serial", "multithreading", "multiprocessing"}: + raise ValueError( + "`parallelism` must be one of 'serial', 'multithreading', or 'multiprocessing'" + ) + elif _parallelism == "serial" and _procs is not None: + warnings.warn( + "`procs` is specified but will be ignored since `parallelism='serial'`" + ) + _procs = None + elif _parallelism == "multithreading" and _procs is not None: + warnings.warn( + "`procs` is specified but will be ignored since `parallelism='multithreading'`" + ) + _procs = None + elif _parallelism == "multiprocessing" and _procs is None: + _procs = cpu_count() + + return _parallelism, _procs From 427f09246363ec277dd8eff1402f2b530e682041 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 28 Nov 2024 20:48:58 +0000 Subject: [PATCH 61/92] test: update test for `random_state` --- pysr/sr.py | 17 +++++++++++------ pysr/test/test_main.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 454991dc3..fcbbb150e 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -871,8 +871,6 @@ def __init__( extra_jax_mappings: Optional[Dict[Callable, str]] = None, denoise: bool = False, select_k_features: Optional[int] = None, - # Deprecated parameters: - multithreading: Optional[bool] = None, **kwargs, ): # Hyperparameters @@ -981,9 +979,6 @@ def __init__( self.denoise = denoise self.select_k_features = select_k_features - # Deprecated but still supported parameters - self.multithreading = multithreading - # Once all valid parameters have been assigned handle the # deprecated kwargs if len(kwargs) > 0: # pragma: no cover @@ -997,6 +992,9 @@ def __init__( "Please use that instead.", FutureWarning, ) + elif k == "multithreading": + # Specific advice given in `_map_parallelism_params` + self.multithreading: Optional[bool] = v # Handle kwargs that have been moved to the fit method elif k in ["weights", "variable_names", "Xresampled"]: warnings.warn( @@ -1799,7 +1797,7 @@ def _run( print("Compiling Julia backend...") parallelism, numprocs = _map_parallelism_params( - self.parallelism, self.procs, self.multithreading + self.parallelism, self.procs, getattr(self, "multithreading", None) ) if self.deterministic and parallelism != "serial": @@ -1807,6 +1805,13 @@ def _run( "To ensure deterministic searches, you must set `parallelism='serial'`. " "Additionally, make sure to set `random_state` to a seed." ) + if self.random_state is not None and ( + parallelism != "serial" or not self.deterministic + ): + warnings.warn( + "Note: Setting `random_state` without also setting `deterministic=True` " + "and `parallelism='serial'` will result in non-deterministic searches."
+ ) if cluster_manager is not None: if parallelism != "multiprocessing": diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index ac94b8ea7..684dcbb41 100644 --- a/pysr/test/test_main.py +++ b/pysr/test/test_main.py @@ -935,7 +935,7 @@ def test_deterministic_warnings(self): warnings.simplefilter("error") with self.assertRaises(Exception) as context: model.fit(X, y) - self.assertIn("`deterministic`", str(context.exception)) + self.assertIn("`deterministic=True`", str(context.exception)) def test_deterministic_errors(self): """Setting deterministic without random_state should error""" From e07ce3d94bcc0372ecd454cc8ba29994d9e22b45 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 11:20:43 +0000 Subject: [PATCH 62/92] test: update tests to new parallelism syntax --- pysr/test/test_jax.py | 3 +-- pysr/test/test_main.py | 11 ++++------- pysr/test/test_nb.ipynb | 2 +- pysr/test/test_torch.py | 3 +-- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py index 827cb9817..98e915315 100644 --- a/pysr/test/test_jax.py +++ b/pysr/test/test_jax.py @@ -138,8 +138,7 @@ def cos_approx(x): }, random_state=0, deterministic=True, - procs=0, - multithreading=False, + parallelism="serial", ) np.random.seed(0) model.fit(X.values, y.values) diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index 684dcbb41..a6f0b7801 100644 --- a/pysr/test/test_main.py +++ b/pysr/test/test_main.py @@ -100,7 +100,7 @@ def test_multiprocessing_turbo_custom_objective(self): # Turbo needs to work with unsafe operators: unary_operators=["sqrt"], procs=2, - multithreading=False, + parallelism="multiprocessing", turbo=True, early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1", loss_function=""" @@ -763,8 +763,7 @@ def test_pickle_with_temp_equation_file(self): model = PySRRegressor( populations=int(1 + DEFAULT_POPULATIONS / 5), temp_equation_file=True, - procs=0, - multithreading=False, + parallelism="serial", ) nout = 3 X = np.random.randn(100, 2) @@ -807,8 +806,7 @@ def test_scikit_learn_compatibility(self): progress=False, random_state=0, deterministic=True, # Deterministic as tests require this. - procs=0, - multithreading=False, + parallelism="serial", warm_start=False, temp_equation_file=True, ) # Return early. 
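The same migration in miniature, as a hedged before/after of the pattern these test edits apply (the old form is kept as a comment):

    from pysr import PySRRegressor

    # Before (deprecated):
    # model = PySRRegressor(
    #     deterministic=True, multithreading=False, procs=0, random_state=0
    # )

    # After:
    model = PySRRegressor(deterministic=True, parallelism="serial", random_state=0)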
@@ -1347,9 +1345,8 @@ def test_unit_propagation(self): complexity_of_constants=10, weight_mutate_constant=0.0, should_optimize_constants=False, - multithreading=False, + parallelism="serial", deterministic=True, - procs=0, random_state=0, output_directory=output_dir, run_id=run_id, diff --git a/pysr/test/test_nb.ipynb b/pysr/test/test_nb.ipynb index 1cd394fff..ea0084ad7 100644 --- a/pysr/test/test_nb.ipynb +++ b/pysr/test/test_nb.ipynb @@ -122,7 +122,7 @@ "X = np.random.randn(10, 2)\n", "y = np.random.randn(10)\n", "\n", - "model = PySRRegressor(deterministic=True, multithreading=False, procs=0, random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n", + "model = PySRRegressor(deterministic=True, parallelism=\"serial\", random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n", "str(model)" ] }, diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py index 50f5285c9..512902b24 100644 --- a/pysr/test/test_torch.py +++ b/pysr/test/test_torch.py @@ -207,8 +207,7 @@ def cos_approx(x): extra_torch_mappings={"cos_approx": cos_approx}, random_state=0, deterministic=True, - procs=0, - multithreading=False, + parallelism="serial", ) np.random.seed(0) model.fit(X.values, y.values) From 1e432e52a653ac03b908702b705afe666f217a84 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 11:29:25 +0000 Subject: [PATCH 63/92] docs: avoid mentioning procs --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 70d755cfb..a96405695 100644 --- a/README.md +++ b/README.md @@ -267,9 +267,8 @@ For details on what each parameter does, check out the [API page](https://ai.dam ```python model = PySRRegressor( - procs=4, populations=8, - # ^ 2 populations per core, so one is always running. + # ^ Assuming we have 4 cores, this means 2 populations per core, so one is always running. population_size=50, # ^ Slightly larger populations, for greater diversity. ncycles_per_iteration=500, From 52ef6767c6d42111643a05cc1396f64b2faacdb6 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 11:35:56 +0000 Subject: [PATCH 64/92] test: help mypy --- pysr/sr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index fcbbb150e..52870d9ca 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2783,7 +2783,9 @@ def _map_parallelism_params( "`multithreading=False, procs={int}` is now `parallelism='multiprocessing', procs={int}`."
) if multithreading: - _parallelism = "multithreading" + _parallelism: Literal["multithreading", "multiprocessing", "serial"] = ( + "multithreading" + ) _procs = None elif procs is not None and procs > 0: _parallelism = "multiprocessing" @@ -2792,7 +2794,9 @@ def _map_parallelism_params( _parallelism = "serial" _procs = None elif using_new: - _parallelism = parallelism + _parallelism = cast( + Literal["serial", "multithreading", "multiprocessing"], parallelism + ) _procs = procs else: _parallelism = "multithreading" From 94ef0bcd66b8a36bbd70861aad58100ceac0b338 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 11:48:57 +0000 Subject: [PATCH 65/92] docs: suggest maxsize=20 in README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a96405695..299f019a9 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,7 @@ PySR's main interface is in the style of scikit-learn: from pysr import PySRRegressor model = PySRRegressor( + maxsize=20, niterations=40, # < Increase me for better results binary_operators=["+", "*"], unary_operators=[ From 88964b213a898441f43f98c13542547a614f1cc7 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 20:21:29 +0000 Subject: [PATCH 66/92] chore: move dependencies to pyproject.toml --- pyproject.toml | 11 ++++++++++- requirements.txt | 8 -------- 2 files changed, 10 insertions(+), 9 deletions(-) delete mode 100644 requirements.txt diff --git a/pyproject.toml b/pyproject.toml index c8d181682..724e5c0e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,16 @@ classifiers = [ "Operating System :: OS Independent", "License :: OSI Approved :: Apache Software License" ] -dynamic = ["dependencies"] +dependencies = [ + "sympy>=1.0.0,<2.0.0", + "pandas>=0.21.0,<3.0.0", + "numpy>=1.13.0,<3.0.0", + "scikit_learn>=1.0.0,<2.0.0", + "juliacall==0.9.23", + "click>=7.0.0,<9.0.0", + "setuptools>=50.0.0", + "typing-extensions>=4.0.0,<5.0.0", +] [tool.setuptools] packages = ["pysr", "pysr._cli", "pysr.test"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 96f4ed9a9..000000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -sympy>=1.0.0,<2.0.0 -pandas>=0.21.0,<3.0.0 -numpy>=1.13.0,<3.0.0 -scikit_learn>=1.0.0,<2.0.0 -juliacall==0.9.23 -click>=7.0.0,<9.0.0 -setuptools>=50.0.0 -typing-extensions>=4.0.0,<5.0.0 From 2df313bec00174ddca440adbf34031db11400711 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 20:36:21 +0000 Subject: [PATCH 67/92] deps: create optional deps under `dev` --- .github/workflows/CI.yml | 18 +++---------- .github/workflows/CI_Windows.yml | 3 +-- .github/workflows/CI_large_nightly.yml | 3 +-- .github/workflows/CI_mac.yml | 7 +----- pyproject.toml | 35 ++++++++++++++------------ 5 files changed, 25 insertions(+), 41 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a454345ae..9d1428541 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -58,26 +58,18 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install . 
+ pip install '.[dev]' python -c 'import pysr' - name: "Assert Julia version" if: ${{ matrix.julia-version != '1'}} run: python3 -c "from pysr import jl; assert jl.VERSION.major == jl.seval('v\"${{ matrix.julia-version }}\"').major; assert jl.VERSION.minor == jl.seval('v\"${{ matrix.julia-version }}\"').minor" - - name: "Install test dependencies" - run: pip install coverage coveralls pytest nbval - name: "Set up coverage for subprocesses" run: echo 'import coverage; coverage.process_startup()' > "${{ github.workspace }}/sitecustomize.py" - name: "Run tests" run: coverage run -m pysr test main,cli,startup - - name: "Install JAX" - run: pip install jax jaxlib # (optional import) - if: ${{ matrix.test-id == 'main' }} - name: "Run JAX tests" run: coverage run --append -m pysr test jax if: ${{ matrix.test-id == 'main' }} - - name: "Install Torch" - run: pip install torch # (optional import) - if: ${{ matrix.test-id == 'main' }} - name: "Run Torch tests" run: coverage run --append -m pysr test torch if: ${{ matrix.test-id == 'main' }} @@ -105,7 +97,7 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install . + pip install '.[dev]' - name: "Run development test" run: PYSR_TEST_JULIA_VERSION=${{ matrix.julia-version }} PYSR_TEST_PYTHON_VERSION=${{ matrix.python-version }} python -m pysr test dev @@ -189,11 +181,7 @@ jobs: - name: "Install PySR and all dependencies" run: | python -m pip install --upgrade pip - pip install . - pip install mypy - - name: "Install additional dependencies" - run: python -m pip install jax jaxlib torch - if: ${{ matrix.python-version != '3.9' }} + pip install '.[dev]' - name: "Run mypy" run: python -m mypy --install-types --non-interactive pysr if: ${{ matrix.python-version != '3.9' }} diff --git a/.github/workflows/CI_Windows.yml b/.github/workflows/CI_Windows.yml index f66644342..a62b7af6e 100644 --- a/.github/workflows/CI_Windows.yml +++ b/.github/workflows/CI_Windows.yml @@ -46,8 +46,7 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval "numpy<2.0.0" - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Run tests" run: | diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index af00eb8c4..369d4bf47 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -40,8 +40,7 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval - pip install . + pip install '.[dev]' python -c 'import pysr' - name: "Assert Julia version" if: ${{ matrix.julia-version != '1'}} diff --git a/.github/workflows/CI_mac.yml b/.github/workflows/CI_mac.yml index fab20425c..68a940ee3 100644 --- a/.github/workflows/CI_mac.yml +++ b/.github/workflows/CI_mac.yml @@ -46,17 +46,12 @@ jobs: - name: "Install PySR" run: | python -m pip install --upgrade pip - pip install pytest nbval - pip install . 
+ pip install '.[dev]' python -c 'import pysr' - name: "Run tests" run: | python -m pysr test main,cli,startup - - name: "Install JAX" - run: pip install jax jaxlib # (optional import) - name: "Run JAX tests" run: python -m pysr test jax - - name: "Install Torch" - run: pip install torch # (optional import) - name: "Run Torch tests" run: python -m pysr test torch diff --git a/pyproject.toml b/pyproject.toml index 724e5c0e6..42518283a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,25 @@ dependencies = [ "typing-extensions>=4.0.0,<5.0.0", ] +[project.optional-dependencies] +dev = [ + "beartype>=0.19,<0.20", + "coverage>=7,<8", + "coveralls>=4,<5", + "ipykernel>=6,<7", + "ipython>=8,<9", + "jax[cpu]>=0.4,<0.5", + "jupyter>=1,<2", + "mypy>=1,<2", + "nbval>=0.11,<0.12", + "pandas-stubs", + "pre-commit>=3.7,<5", + "pytest>=8,<9", + "torch>=2,<3", + "types-openpyxl", + "types-pytz", +] + [tool.setuptools] packages = ["pysr", "pysr._cli", "pysr.test"] include-package-data = false @@ -38,19 +57,3 @@ dependencies = {file = "requirements.txt"} [tool.isort] profile = "black" - -[tool.rye] -dev-dependencies = [ - "pre-commit>=3.7.0", - "ipython>=8.23.0", - "ipykernel>=6.29.4", - "mypy>=1.10.0", - "jax[cpu]>=0.4.26", - "torch>=2.3.0", - "pandas-stubs>=2.2.1.240316", - "types-pytz>=2024.1.0.20240417", - "types-openpyxl>=3.1.0.20240428", - "coverage>=7.5.3", - "pytest>=8.3.3", - "nbval>=0.11.0", -] From d1927bf5926299c17bf084ae5f517422d6ca268e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 21:00:16 +0000 Subject: [PATCH 68/92] ci: enable beartype checking --- .github/workflows/CI.yml | 1 + pyproject.toml | 1 + pysr/__init__.py | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 9d1428541..842d833dc 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -20,6 +20,7 @@ jobs: timeout-minutes: 60 env: COVERAGE_PROCESS_START: "${{ github.workspace }}/.coveragerc" + PYSR_USE_BEARTYPE: "1" defaults: run: shell: bash diff --git a/pyproject.toml b/pyproject.toml index 42518283a..c9ed895f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dev = [ "torch>=2,<3", "types-openpyxl", "types-pytz", + "pytest-cov>=5,<7", ] [tool.setuptools] diff --git a/pysr/__init__.py b/pysr/__init__.py index 0f4497bfb..4a13af3f3 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -1,3 +1,10 @@ +import os + +if os.environ.get("PYSR_USE_BEARTYPE", "0") == "1": + from beartype.claw import beartype_this_package + + beartype_this_package() + # This must be imported as early as possible to prevent # library linking issues caused by numpy/pytorch/etc. 
importing # old libraries: From aa8fda8a964df89a8c0d6475e555cfb9913acb3c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 22:33:03 +0000 Subject: [PATCH 69/92] refactor: incorporate PEP 585 --- pysr/denoising.py | 14 ++- pysr/export.py | 17 ++- pysr/export_latex.py | 18 +-- pysr/export_numpy.py | 7 +- pysr/export_sympy.py | 17 ++- pysr/expression_specs.py | 16 +-- pysr/feature_selection.py | 8 +- pysr/julia_extensions.py | 4 +- pysr/julia_helpers.py | 8 +- pysr/sr.py | 243 ++++++++++++++++++-------------------- pysr/utils.py | 24 ++-- 11 files changed, 200 insertions(+), 176 deletions(-) diff --git a/pysr/denoising.py b/pysr/denoising.py index 5ab6a168e..fbf17468b 100644 --- a/pysr/denoising.py +++ b/pysr/denoising.py @@ -1,6 +1,8 @@ """Functions for denoising data during preprocessing.""" -from typing import Optional, Tuple, cast +from __future__ import annotations + +from typing import cast import numpy as np from numpy import ndarray @@ -9,9 +11,9 @@ def denoise( X: ndarray, y: ndarray, - Xresampled: Optional[ndarray] = None, - random_state: Optional[np.random.RandomState] = None, -) -> Tuple[ndarray, ndarray]: + Xresampled: ndarray | None = None, + random_state: np.random.RandomState | None = None, +) -> tuple[ndarray, ndarray]: """Denoise the dataset using a Gaussian process.""" from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel @@ -31,8 +33,8 @@ def denoise( def multi_denoise( X: ndarray, y: ndarray, - Xresampled: Optional[ndarray] = None, - random_state: Optional[np.random.RandomState] = None, + Xresampled: ndarray | None = None, + random_state: np.random.RandomState | None = None, ): """Perform `denoise` along each column of `y` independently.""" y = np.stack( diff --git a/pysr/export.py b/pysr/export.py index b79bbacd7..bc88b7e15 100644 --- a/pysr/export.py +++ b/pysr/export.py @@ -1,5 +1,12 @@ +from __future__ import annotations + import copy -from typing import Callable, Dict, Optional, Union +import sys + +if sys.version_info >= (3, 10): + from collections.abc import Callable +else: + from typing import Callable import numpy as np import pandas as pd @@ -16,11 +23,11 @@ def add_export_formats( output: pd.DataFrame, *, feature_names_in: ArrayLike[str], - selection_mask: Union[NDArray[np.bool_], None] = None, - extra_sympy_mappings: Optional[Dict[str, Callable]] = None, - extra_torch_mappings: Optional[Dict[Callable, Callable]] = None, + selection_mask: NDArray[np.bool_] | None = None, + extra_sympy_mappings: dict[str, Callable] | None = None, + extra_torch_mappings: dict[Callable, Callable] | None = None, output_torch_format: bool = False, - extra_jax_mappings: Optional[Dict[Callable, str]] = None, + extra_jax_mappings: dict[Callable, str] | None = None, output_jax_format: bool = False, ) -> pd.DataFrame: """Create export formats for an equations dataframe. 
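As a minimal sketch of the annotation style this refactor adopts (PEP 585 builtin generics and PEP 604 `|` unions, made safe on older interpreters by the `from __future__ import annotations` lines added above; the function itself is illustrative, not part of PySR):

    from __future__ import annotations

    def normalize(values: list[float], shift: float | None = None) -> tuple[list[float], float]:
        # `float | None` and the builtin `list`/`tuple` generics replace
        # the old typing.Optional/List/Tuple spellings
        offset = shift if shift is not None else min(values)
        return [v - offset for v in values], offset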
diff --git a/pysr/export_latex.py b/pysr/export_latex.py index b7815d07c..f00fc48ad 100644 --- a/pysr/export_latex.py +++ b/pysr/export_latex.py @@ -1,6 +1,6 @@ """Functions to help export PySR equations to LaTeX.""" -from typing import List, Optional, Tuple +from __future__ import annotations import pandas as pd import sympy # type: ignore @@ -28,8 +28,8 @@ def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str: def generate_table_environment( - columns: List[str] = ["equation", "complexity", "loss"] -) -> Tuple[str, str]: + columns: list[str] = ["equation", "complexity", "loss"] +) -> tuple[str, str]: margins = "c" * len(columns) column_map = { "complexity": "Complexity", @@ -61,9 +61,9 @@ def generate_table_environment( def sympy2latextable( equations: pd.DataFrame, - indices: Optional[List[int]] = None, + indices: list[int] | None = None, precision: int = 3, - columns: List[str] = ["equation", "complexity", "loss", "score"], + columns: list[str] = ["equation", "complexity", "loss", "score"], max_equation_length: int = 50, output_variable_name: str = "y", ) -> str: @@ -128,11 +128,11 @@ def sympy2latextable( def sympy2multilatextable( - equations: List[pd.DataFrame], - indices: Optional[List[List[int]]] = None, + equations: list[pd.DataFrame], + indices: list[list[int]] | None = None, precision: int = 3, - columns: List[str] = ["equation", "complexity", "loss", "score"], - output_variable_names: Optional[List[str]] = None, + columns: list[str] = ["equation", "complexity", "loss", "score"], + output_variable_names: list[str] | None = None, ) -> str: """Generate multiple latex tables for a list of equation sets.""" # TODO: Let user specify custom output variable diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py index 4c1d12e73..c67834985 100644 --- a/pysr/export_numpy.py +++ b/pysr/export_numpy.py @@ -1,7 +1,8 @@ """Code for exporting discovered expressions to numpy""" +from __future__ import annotations + import warnings -from typing import List, Union import numpy as np import pandas as pd @@ -17,8 +18,8 @@ class CallableEquation: """Simple wrapper for numpy lambda functions built with sympy""" _sympy: Expr - _sympy_symbols: List[Symbol] - _selection: Union[NDArray[np.bool_], None] + _sympy_symbols: list[Symbol] + _selection: NDArray[np.bool_] | None def __init__(self, eqn, sympy_symbols, selection=None): self._sympy = eqn diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py index f38593413..6f8662059 100644 --- a/pysr/export_sympy.py +++ b/pysr/export_sympy.py @@ -1,6 +1,13 @@ """Define utilities to export to sympy""" -from typing import Callable, Dict, List, Optional +from __future__ import annotations + +import sys + +if sys.version_info >= (3, 10): + from collections.abc import Callable +else: + from typing import Callable import sympy # type: ignore from sympy import sympify @@ -63,21 +70,21 @@ def create_sympy_symbols_map( feature_names_in: ArrayLike[str], -) -> Dict[str, sympy.Symbol]: +) -> dict[str, sympy.Symbol]: return {variable: sympy.Symbol(variable) for variable in feature_names_in} def create_sympy_symbols( feature_names_in: ArrayLike[str], -) -> List[sympy.Symbol]: +) -> list[sympy.Symbol]: return [sympy.Symbol(variable) for variable in feature_names_in] def pysr2sympy( equation: str, *, - feature_names_in: Optional[ArrayLike[str]] = None, - extra_sympy_mappings: Optional[Dict[str, Callable]] = None, + feature_names_in: ArrayLike[str] | None = None, + extra_sympy_mappings: dict[str, Callable] | None = None, ): if feature_names_in is None: 
feature_names_in = [] diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index eab85a6ec..b7103d357 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import copy import sys from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Dict, List, NewType, Optional +from typing import TYPE_CHECKING, Any, NewType if sys.version_info >= (3, 10): from typing import TypeAlias @@ -59,7 +61,7 @@ def create_exports( self, model: PySRRegressor, equations: pd.DataFrame, - search_output: Any, + search_output, ) -> pd.DataFrame: """Create additional columns in the equations dataframe.""" pass # pragma: no cover @@ -98,7 +100,7 @@ def create_exports( self, model: PySRRegressor, equations: pd.DataFrame, - search_output: Any, + search_output, ): return add_export_formats( equations, @@ -169,9 +171,9 @@ class TemplateExpressionSpec(AbstractExpressionSpec): def __init__( self, - function_symbols: List[str], + function_symbols: list[str], combine: str, - num_features: Optional[Dict[str, int]] = None, + num_features: dict[str, int] | None = None, ): self.function_symbols = function_symbols self.combine = combine @@ -210,7 +212,7 @@ def create_exports( self, model: PySRRegressor, equations: pd.DataFrame, - search_output: Any, + search_output, ) -> pd.DataFrame: # We try to load the raw julia state from a saved binary stream # if not provided. @@ -236,7 +238,7 @@ def create_exports( self, model: PySRRegressor, equations: pd.DataFrame, - search_output: Any, + search_output, ): search_output = search_output or model.julia_state_ return _search_output_to_callable_expressions(equations, search_output) diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py index 7702e255a..a0e4d2072 100644 --- a/pysr/feature_selection.py +++ b/pysr/feature_selection.py @@ -1,6 +1,8 @@ """Functions for doing feature selection during preprocessing.""" -from typing import Optional, cast +from __future__ import annotations + +from typing import cast import numpy as np from numpy import ndarray @@ -13,7 +15,7 @@ def run_feature_selection( X: ndarray, y: ndarray, select_k_features: int, - random_state: Optional[np.random.RandomState] = None, + random_state: np.random.RandomState | None = None, ) -> NDArray[np.bool_]: """ Find most important features. 
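A hedged usage sketch of the helper annotated above (an internal API, so subject to change; the data here is synthetic):

    import numpy as np
    from pysr.feature_selection import run_feature_selection

    X = np.random.randn(200, 5)
    y = 3.0 * X[:, 1] - X[:, 3]

    # Boolean mask over the 5 columns, keeping the 2 most informative
    mask = run_feature_selection(X, y, select_k_features=2)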
@@ -38,7 +40,7 @@ # Function has not been removed only due to usage in module tests def _handle_feature_selection( X: ndarray, - select_k_features: Optional[int], + select_k_features: int | None, y: ndarray, variable_names: ArrayLike[str], ): diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py index ac4714d48..54c01c6a3 100644 --- a/pysr/julia_extensions.py +++ b/pysr/julia_extensions.py @@ -1,6 +1,6 @@ """This file installs and loads extensions for SymbolicRegression.""" -from typing import Optional +from __future__ import annotations from .julia_import import Pkg, jl @@ -10,7 +10,7 @@ def load_required_packages( turbo: bool = False, bumper: bool = False, enable_autodiff: bool = False, - cluster_manager: Optional[str] = None, + cluster_manager: str | None = None, ): if turbo: load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890") diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index a93b6265d..917c4fd3d 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,6 +1,12 @@ """Functions for initializing the Julia environment and installing deps.""" -from typing import Any, Callable, cast, overload +import sys +from typing import Any, cast, overload + +if sys.version_info >= (3, 10): + from collections.abc import Callable +else: + from typing import Callable import numpy as np from juliacall import convert as jl_convert # type: ignore diff --git a/pysr/sr.py b/pysr/sr.py index 52870d9ca..6dc5169fe 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1,5 +1,7 @@ """Define the PySRRegressor scikit-learn interface.""" +from __future__ import annotations + import copy import os import pickle as pkl @@ -11,18 +13,12 @@ from io import StringIO from multiprocessing import cpu_count from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - List, - Literal, - Optional, - Tuple, - Type, - Union, - cast, -) +from typing import Any, Literal, cast + +if sys.version_info >= (3, 10): + from collections.abc import Callable +else: + from typing import Callable import numpy as np import pandas as pd @@ -72,10 +68,10 @@ def _process_constraints( - binary_operators: List[str], - unary_operators: List[Union[Any, str]], - constraints: Dict[str, Union[int, Tuple[int, int]]], -) -> Dict[str, Union[int, Tuple[int, int]]]: + binary_operators: list[str], + unary_operators: list, + constraints: dict[str, int | tuple[int, int]], +) -> dict[str, int | tuple[int, int]]: constraints = constraints.copy() for op in unary_operators: if op not in constraints: @@ -94,7 +90,7 @@ def _process_constraints( ) constraints[op] = (-1, -1) - constraint_tuple = cast(Tuple[int, int], constraints[op]) + constraint_tuple = cast(tuple[int, int], constraints[op]) if op in ["plus", "sub", "+", "-"]: if constraint_tuple[0] != constraint_tuple[1]: raise NotImplementedError( @@ -111,10 +107,10 @@ def _maybe_create_inline_operators( - binary_operators: List[str], - unary_operators: List[str], - extra_sympy_mappings: Optional[Dict[str, Callable]], -) -> Tuple[List[str], List[str]]: + binary_operators: list[str], + unary_operators: list[str], + extra_sympy_mappings: dict[str, Callable] | None, +) -> tuple[list[str], list[str]]: binary_operators = binary_operators.copy() unary_operators = unary_operators.copy() for op_list in [binary_operators, unary_operators]: @@ -232,10 +228,10 @@ def _validate_export_mappings(extra_jax_mappings, extra_torch_mappings): class _DynamicallySetParams: """Defines some parameters that are set at
runtime.""" - binary_operators: List[str] - unary_operators: List[str] + binary_operators: list[str] + unary_operators: list[str] maxdepth: int - constraints: Dict[str, Union[int, Tuple[int, int]]] + constraints: dict[str, int | tuple[int, int]] batch_size: int update_verbosity: int progress: bool @@ -379,7 +375,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): `idx` argument to the function, which is `nothing` for non-batched, and a 1D array of indices for batched. Default is `None`. - complexity_of_operators : dict[str, Union[int, float]] + complexity_of_operators : dict[str, int | float] If you would like to use a complexity other than 1 for an operator, specify the complexity here. For example, `{"sin": 2, "+": 1}` would give a complexity of 2 for each use @@ -541,10 +537,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. Default is `0.982`. - parallelism: Optional[Literal["serial", "multithreading", "multiprocessing"]] + parallelism: Literal["serial", "multithreading", "multiprocessing"] | None Parallelism to use for the search. Can be `"serial"`, `"multithreading"`, or `"multiprocessing"`. Default is `"multithreading"`. - procs: Optional[int] + procs: int | None Number of processes to use for parallelism. If `None`, defaults to `cpu_count()`. Default is `None`. cluster_manager : str @@ -702,7 +698,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Number of output dimensions. selection_mask_ : ndarray of shape (`n_features_in_`,) Mask of which features of `X` to use when `select_k_features` is set. - tempdir_ : Optional[Path] + tempdir_ : Path | None Path to the temporary equations directory. 
julia_state_stream_ : ndarray The serialized state for the julia SymbolicRegression.jl backend (after fitting), @@ -758,54 +754,54 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): ``` """ - equations_: Union[pd.DataFrame, List[pd.DataFrame], None] + equations_: pd.DataFrame | list[pd.DataFrame] | None n_features_in_: int feature_names_in_: ArrayLike[str] display_feature_names_in_: ArrayLike[str] - complexity_of_variables_: Union[int, float, List[Union[int, float]], None] - X_units_: Union[ArrayLike[str], None] - y_units_: Union[str, ArrayLike[str], None] + complexity_of_variables_: int | float | list[int | float] | None + X_units_: ArrayLike[str] | None + y_units_: str | ArrayLike[str] | None nout_: int - selection_mask_: Union[NDArray[np.bool_], None] + selection_mask_: NDArray[np.bool_] | None run_id_: str output_directory_: str - julia_state_stream_: Union[NDArray[np.uint8], None] - julia_options_stream_: Union[NDArray[np.uint8], None] - equation_file_contents_: Union[List[pd.DataFrame], None] + julia_state_stream_: NDArray[np.uint8] | None + julia_options_stream_: NDArray[np.uint8] | None + equation_file_contents_: list[pd.DataFrame] | None show_pickle_warnings_: bool def __init__( self, model_selection: Literal["best", "accuracy", "score"] = "best", *, - binary_operators: Optional[List[str]] = None, - unary_operators: Optional[List[str]] = None, - expression_spec: Optional[AbstractExpressionSpec] = None, + binary_operators: list[str] | None = None, + unary_operators: list[str] | None = None, + expression_spec: AbstractExpressionSpec | None = None, niterations: int = 100, populations: int = 31, population_size: int = 27, - max_evals: Optional[int] = None, + max_evals: int | None = None, maxsize: int = 30, - maxdepth: Optional[int] = None, - warmup_maxsize_by: Optional[float] = None, - timeout_in_seconds: Optional[float] = None, - constraints: Optional[Dict[str, Union[int, Tuple[int, int]]]] = None, - nested_constraints: Optional[Dict[str, Dict[str, int]]] = None, - elementwise_loss: Optional[str] = None, - loss_function: Optional[str] = None, - complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None, - complexity_of_constants: Optional[Union[int, float]] = None, - complexity_of_variables: Optional[Union[int, float]] = None, - complexity_mapping: Optional[str] = None, + maxdepth: int | None = None, + warmup_maxsize_by: float | None = None, + timeout_in_seconds: float | None = None, + constraints: dict[str, int | tuple[int, int]] | None = None, + nested_constraints: dict[str, dict[str, int]] | None = None, + elementwise_loss: str | None = None, + loss_function: str | None = None, + complexity_of_operators: dict[str, int | float] | None = None, + complexity_of_constants: int | float | None = None, + complexity_of_variables: int | float | list[int | float] | None = None, + complexity_mapping: str | None = None, parsimony: float = 0.0, - dimensional_constraint_penalty: Optional[float] = None, + dimensional_constraint_penalty: float | None = None, dimensionless_constants_only: bool = False, use_frequency: bool = True, use_frequency_in_tournament: bool = True, adaptive_parsimony_scaling: float = 1040.0, alpha: float = 3.17, annealing: bool = False, - early_stop_condition: Optional[Union[float, str]] = None, + early_stop_condition: float | str | None = None, ncycles_per_iteration: int = 380, fraction_replaced: float = 0.00036, fraction_replaced_hof: float = 0.0614, @@ -829,21 +825,21 @@ def __init__( should_optimize_constants: bool = True, 
optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS", optimizer_nrestarts: int = 2, - optimizer_f_calls_limit: Optional[int] = None, + optimizer_f_calls_limit: int | None = None, optimize_probability: float = 0.14, optimizer_iterations: int = 8, perturbation_factor: float = 0.129, probability_negate_constant: float = 0.00743, tournament_selection_n: int = 15, tournament_selection_p: float = 0.982, - parallelism: Optional[ - Literal["serial", "multithreading", "multiprocessing"] - ] = None, - procs: Optional[int] = None, - cluster_manager: Optional[ - Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] - ] = None, - heap_size_hint_in_bytes: Optional[int] = None, + parallelism: ( + Literal["serial", "multithreading", "multiprocessing"] | None + ) = None, + procs: int | None = None, + cluster_manager: ( + Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] | None + ) = None, + heap_size_hint_in_bytes: int | None = None, batching: bool = False, batch_size: int = 50, fast_cycle: bool = False, @@ -855,22 +851,22 @@ def __init__( deterministic: bool = False, warm_start: bool = False, verbosity: int = 1, - update_verbosity: Optional[int] = None, + update_verbosity: int | None = None, print_precision: int = 5, progress: bool = True, - run_id: Optional[str] = None, - output_directory: Optional[str] = None, + run_id: str | None = None, + output_directory: str | None = None, temp_equation_file: bool = False, - tempdir: Optional[str] = None, + tempdir: str | None = None, delete_tempfiles: bool = True, update: bool = False, output_jax_format: bool = False, output_torch_format: bool = False, - extra_sympy_mappings: Optional[Dict[str, Callable]] = None, - extra_torch_mappings: Optional[Dict[Callable, Callable]] = None, - extra_jax_mappings: Optional[Dict[Callable, str]] = None, + extra_sympy_mappings: dict[str, Callable] | None = None, + extra_torch_mappings: dict[Callable, Callable] | None = None, + extra_jax_mappings: dict[Callable, str] | None = None, denoise: bool = False, - select_k_features: Optional[int] = None, + select_k_features: int | None = None, **kwargs, ): # Hyperparameters @@ -994,7 +990,7 @@ def __init__( ) elif k == "multithreading": # Specific advice given in `_map_parallelism_params` - self.multithreading: Optional[bool] = v + self.multithreading: bool | None = v # Handle kwargs that have been moved to the fit method elif k in ["weights", "variable_names", "Xresampled"]: warnings.warn( @@ -1030,11 +1026,11 @@ def from_file( equation_file: None = None, # Deprecated *, run_directory: PathLike, - binary_operators: Optional[List[str]] = None, - unary_operators: Optional[List[str]] = None, - n_features_in: Optional[int] = None, - feature_names_in: Optional[ArrayLike[str]] = None, - selection_mask: Optional[NDArray[np.bool_]] = None, + binary_operators: list[str] | None = None, + unary_operators: list[str] | None = None, + n_features_in: int | None = None, + feature_names_in: ArrayLike[str] | None = None, + selection_mask: NDArray[np.bool_] | None = None, nout: int = 1, **pysr_kwargs, ) -> "PySRRegressor": @@ -1090,7 +1086,7 @@ def from_file( assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model: "PySRRegressor" = pkl.load(f) + model: "pysr.sr.PySRRegressor" = pkl.load(f) # Update any parameters if necessary, such as # extra_sympy_mappings: @@ -1190,7 +1186,7 @@ def __repr__(self) -> str: output += "]" return output - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """ Handle 
pickle serialization for PySRRegressor. @@ -1277,7 +1273,7 @@ def julia_options_(self): def julia_state_(self): """The deserialized state.""" return cast( - Optional[Tuple[VectorValue, AnyValue]], + tuple[VectorValue, AnyValue] | None, jl_deserialize(self.julia_state_stream_), ) @@ -1296,8 +1292,8 @@ def expression_spec_(self): return self.expression_spec or ExpressionSpec() def get_best( - self, index: Optional[Union[int, List[int]]] = None - ) -> Union[pd.Series, List[pd.Series]]: + self, index: int | list[int] | None = None + ) -> pd.Series | list[pd.Series]: """ Get best equation using `model_selection`. @@ -1463,15 +1459,15 @@ def _validate_and_set_fit_params( complexity_of_variables, X_units, y_units, - ) -> Tuple[ + ) -> tuple[ ndarray, ndarray, - Optional[ndarray], - Optional[ndarray], + ndarray | None, + ndarray | None, ArrayLike[str], - Optional[Union[int, float, List[Union[int, float]]]], - Optional[ArrayLike[str]], - Optional[Union[str, ArrayLike[str]]], + int | float | list[int | float] | None, + ArrayLike[str] | None, + str | ArrayLike[str] | None, ]: """ Validate the parameters passed to the :term`fit` method. @@ -1604,15 +1600,15 @@ def _validate_and_set_fit_params( y_units, ) - def _validate_data_X_y(self, X: Any, y: Any) -> Tuple[ndarray, ndarray]: + def _validate_data_X_y(self, X: Any, y: Any) -> tuple[ndarray, ndarray]: raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore - return cast(Tuple[ndarray, ndarray], raw_out) + return cast(tuple[ndarray, ndarray], raw_out) def _validate_data_X(self, X: Any) -> ndarray: raw_out = self._validate_data(X=X, reset=False) # type: ignore return cast(ndarray, raw_out) - def _get_precision_mapped_dtype(self, X: np.ndarray) -> Type: + def _get_precision_mapped_dtype(self, X: np.ndarray) -> type: is_complex = np.issubdtype(X.dtype, np.complexfloating) is_real = not is_complex if is_real: @@ -1624,11 +1620,11 @@ def _pre_transform_training_data( self, X: ndarray, y: ndarray, - Xresampled: Union[ndarray, None], + Xresampled: ndarray | None, variable_names: ArrayLike[str], - complexity_of_variables: Optional[Union[int, float, List[Union[int, float]]]], - X_units: Union[ArrayLike[str], None], - y_units: Union[ArrayLike[str], str, None], + complexity_of_variables: int | float | list[int | float] | None, + X_units: ArrayLike[str] | None, + y_units: ArrayLike[str] | str | None, random_state: np.random.RandomState, ): """ @@ -1740,8 +1736,8 @@ def _run( X: ndarray, y: ndarray, runtime_params: _DynamicallySetParams, - weights: Optional[ndarray], - category: Optional[ndarray], + weights: ndarray | None, + category: ndarray | None, seed: int, ): """ @@ -1897,8 +1893,8 @@ def _run( optimize=self.weight_optimize, ) - jl_binary_operators: List[Any] = [] - jl_unary_operators: List[Any] = [] + jl_binary_operators: list[Any] = [] + jl_unary_operators: list[Any] = [] for input_list, output_list, name in [ (binary_operators, jl_binary_operators, "binary"), (unary_operators, jl_unary_operators, "unary"), @@ -2070,13 +2066,11 @@ def fit( *, Xresampled=None, weights=None, - variable_names: Optional[ArrayLike[str]] = None, - complexity_of_variables: Optional[ - Union[int, float, List[Union[int, float]]] - ] = None, - X_units: Optional[ArrayLike[str]] = None, - y_units: Optional[Union[str, ArrayLike[str]]] = None, - category: Optional[ndarray] = None, + variable_names: ArrayLike[str] | None = None, + complexity_of_variables: int | float | list[int | float] | None = None, + X_units: ArrayLike[str] | None = None, + y_units: str | 
ArrayLike[str] | None = None, + category: ndarray | None = None, ) -> "PySRRegressor": """ Search for equations to fit the dataset and store them in `self.equations_`. @@ -2243,7 +2237,7 @@ def fit( return self - def refresh(self, run_directory: Optional[PathLike] = None) -> None: + def refresh(self, run_directory: PathLike | None = None) -> None: """ Update self.equations_ with any new options passed. @@ -2266,9 +2260,9 @@ def refresh(self, run_directory: Optional[PathLike] = None) -> None: def predict( self, X, - index: Optional[Union[int, List[int]]] = None, + index: int | list[int] | None = None, *, - category: Optional[ndarray] = None, + category: ndarray | None = None, ) -> ndarray: """ Predict y from input X using the equation chosen by `model_selection`. @@ -2363,7 +2357,7 @@ def predict( "You can then run `model.refresh()` to re-load the expressions." ) from error - def sympy(self, index: Optional[Union[int, List[int]]] = None): + def sympy(self, index: int | list[int] | None = None): """ Return sympy representation of the equation(s) chosen by `model_selection`. @@ -2394,8 +2388,8 @@ def sympy(self, index: Optional[Union[int, List[int]]] = None): return best_equation["sympy_format"] def latex( - self, index: Optional[Union[int, List[int]]] = None, precision: int = 3 - ) -> Union[str, List[str]]: + self, index: int | list[int] | None = None, precision: int = 3 + ) -> str | list[str]: """ Return latex representation of the equation(s) chosen by `model_selection`. @@ -2502,7 +2496,7 @@ def pytorch(self, index=None): else: return best_equation["torch_format"] - def get_equation_file(self, i: Optional[int] = None) -> Path: + def get_equation_file(self, i: int | None = None) -> Path: if i is not None: return ( Path(self.output_directory_) @@ -2512,7 +2506,7 @@ def get_equation_file(self, i: Optional[int] = None) -> Path: else: return Path(self.output_directory_) / self.run_id_ / "hall_of_fame.csv" - def _read_equation_file(self) -> List[pd.DataFrame]: + def _read_equation_file(self) -> list[pd.DataFrame]: """Read the hall of fame file created by `SymbolicRegression.jl`.""" try: @@ -2554,9 +2548,7 @@ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame: return df - def get_hof( - self, search_output: Optional[Any] = None - ) -> Union[pd.DataFrame, List[pd.DataFrame]]: + def get_hof(self, search_output=None) -> pd.DataFrame | list[pd.DataFrame]: """Get the equations from a hall of fame file or search output. If no arguments entered, the ones used @@ -2581,7 +2573,7 @@ def get_hof( _validate_export_mappings(self.extra_jax_mappings, self.extra_torch_mappings) - equation_file_contents = cast(List[pd.DataFrame], self.equation_file_contents_) + equation_file_contents = cast(list[pd.DataFrame], self.equation_file_contents_) ret_outputs = [ pd.concat( @@ -2601,9 +2593,9 @@ def get_hof( def latex_table( self, - indices: Optional[List[int]] = None, + indices: list[int] | None = None, precision: int = 3, - columns: List[str] = ["equation", "complexity", "loss", "score"], + columns: list[str] = ["equation", "complexity", "loss", "score"], ) -> str: """Create a LaTeX/booktabs table for all, or some, of the equations. 
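For context, a hedged end-to-end sketch of this method (a tiny run with illustrative settings only):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(50, 2)
    y = X[:, 0] + X[:, 1]

    model = PySRRegressor(niterations=1, binary_operators=["+", "*"])
    model.fit(X, y)

    # booktabs-style LaTeX source for the discovered equations
    print(model.latex_table(precision=4, columns=["equation", "complexity", "loss"]))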
@@ -2711,11 +2703,6 @@ def calculate_scores(df: pd.DataFrame) -> pd.DataFrame: def _mutate_parameter(param_name: str, param_value): - if param_name in ["binary_operators", "unary_operators"] and isinstance( - param_value, str - ): - return [param_value] - if param_name == "batch_size" and param_value < 1: warnings.warn( "Given `batch_size` must be greater than or equal to one. " @@ -2738,10 +2725,10 @@ def _mutate_parameter(param_name: str, param_value): def _map_parallelism_params( - parallelism: Optional[Literal["serial", "multithreading", "multiprocessing"]], - procs: Optional[int], - multithreading: Optional[bool], -) -> Tuple[Literal["serial", "multithreading", "multiprocessing"], Optional[int]]: + parallelism: Literal["serial", "multithreading", "multiprocessing"] | None, + procs: int | None, + multithreading: bool | None, +) -> tuple[Literal["serial", "multithreading", "multiprocessing"], int | None]: """Map old and new parallelism parameters to the new format. Parameters diff --git a/pysr/utils.py b/pysr/utils.py index 9a87fc28e..e18a5b4c3 100644 --- a/pysr/utils.py +++ b/pysr/utils.py @@ -1,25 +1,35 @@ +from __future__ import annotations + import difflib import inspect import re from pathlib import Path -from typing import Any, List, TypeVar, Union +from typing import Any, TypeVar from numpy import ndarray from sklearn.utils.validation import _check_feature_names_in # type: ignore T = TypeVar("T", bound=Any) -ArrayLike = Union[ndarray, List[T]] -PathLike = Union[str, Path] +ArrayLike = ndarray | list[T] +PathLike = str | Path _regexp_im = re.compile(r"\b(\d+\.\d+)im\b") _regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b") _regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b") -_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x) -_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x) -_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x) + +def _apply_regexp_im(x: str): + return _regexp_im.sub(r"\1j", x) + + +def _apply_regexp_im_sci(x: str): + return _regexp_im_sci.sub(r"\1e\2j", x) + + +def _apply_regexp_sci(x: str): + return _regexp_sci.sub(r"\1e\2", x) def _preprocess_julia_floats(s: str) -> str: @@ -48,7 +58,7 @@ def _subscriptify(i: int) -> str: return "".join([chr(0x2080 + int(c)) for c in str(i)]) -def _suggest_keywords(cls, k: str) -> List[str]: +def _suggest_keywords(cls, k: str) -> list[str]: valid_keywords = [ param for param in inspect.signature(cls.__init__).parameters From e216da82bace9273c7618c32b797241d21773830 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 22:45:01 +0000 Subject: [PATCH 70/92] fix: dockerfile requirements --- Apptainer.def | 1 - Dockerfile | 4 ---- pysr/test/test_dev_pysr.dockerfile | 4 ---- 3 files changed, 9 deletions(-) diff --git a/Apptainer.def b/Apptainer.def index fa4d87426..baad8a4c8 100644 --- a/Apptainer.def +++ b/Apptainer.def @@ -19,7 +19,6 @@ Stage: runtime /usr/local/julia /usr/local/julia %files - ./requirements.txt /pysr/requirements.txt ./pyproject.toml /pysr/pyproject.toml ./setup.py /pysr/setup.py ./pysr /pysr/pysr diff --git a/Dockerfile b/Dockerfile index e87ce270b..a446f3278 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,10 +17,6 @@ RUN pip install --no-cache-dir ipython matplotlib WORKDIR /pysr -# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) -ADD ./requirements.txt /pysr/requirements.txt -RUN pip3 install --no-cache-dir -r /pysr/requirements.txt - # Install PySR: # We do a 
minimal copy so it doesn't need to rerun at every file change: ADD ./pyproject.toml /pysr/pyproject.toml diff --git a/pysr/test/test_dev_pysr.dockerfile b/pysr/test/test_dev_pysr.dockerfile index 421836a8f..fcde426b9 100644 --- a/pysr/test/test_dev_pysr.dockerfile +++ b/pysr/test/test_dev_pysr.dockerfile @@ -15,10 +15,6 @@ ENV PATH="/usr/local/julia/bin:${PATH}" WORKDIR /pysr -# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project) -ADD ./requirements.txt /pysr/requirements.txt -RUN pip3 install --no-cache-dir -r /pysr/requirements.txt - # Install PySR: # We do a minimal copy so it doesn't need to rerun at every file change: ADD ./pyproject.toml /pysr/pyproject.toml From c2e7d33616e6643e28bc1519e655eac11dc03ed8 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 23:20:44 +0000 Subject: [PATCH 71/92] ci: separate beartype test --- .github/workflows/CI.yml | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 842d833dc..8c89516dd 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -20,7 +20,6 @@ jobs: timeout-minutes: 60 env: COVERAGE_PROCESS_START: "${{ github.workspace }}/.coveragerc" - PYSR_USE_BEARTYPE: "1" defaults: run: shell: bash @@ -189,3 +188,30 @@ jobs: - name: "Run compatible mypy" run: python -m mypy --ignore-missing-imports pysr if: ${{ matrix.python-version == '3.9' }} + + beartype: + name: Test with beartype + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + env: + PYSR_USE_BEARTYPE: "1" + strategy: + matrix: + python-version: ['3.12'] + os: ['ubuntu-latest'] + + steps: + - uses: actions/checkout@v4 + - name: "Set up Python" + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: "Install PySR and all dependencies" + run: | + python -m pip install --upgrade pip + pip install '.[dev]' + - name: "Run tests" + run: python -m pysr test main,jax,torch From 1eadf50391082c5bb5744538686e9b745d9c6916 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 29 Nov 2024 23:24:22 +0000 Subject: [PATCH 72/92] test: fix some beartype-identified errors --- pysr/export_sympy.py | 2 +- pysr/test/params.py | 7 +++++++ pysr/test/test_main.py | 13 +++++++------ pysr/test/test_torch.py | 13 ++++++++----- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py index 6f8662059..9f3112e87 100644 --- a/pysr/export_sympy.py +++ b/pysr/export_sympy.py @@ -81,7 +81,7 @@ def create_sympy_symbols( def pysr2sympy( - equation: str, + equation: str | float | int, *, feature_names_in: ArrayLike[str] | None = None, extra_sympy_mappings: dict[str, Callable] | None = None, diff --git a/pysr/test/params.py b/pysr/test/params.py index 54da4ac7d..10de46c99 100644 --- a/pysr/test/params.py +++ b/pysr/test/params.py @@ -1,4 +1,6 @@ import inspect +import os +import unittest from pysr import PySRRegressor @@ -6,3 +8,8 @@ DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default DEFAULT_POPULATIONS = DEFAULT_PARAMS["populations"].default DEFAULT_NCYCLES = DEFAULT_PARAMS["ncycles_per_iteration"].default + +skip_if_beartype = unittest.skipIf( + os.environ.get("PYSR_USE_BEARTYPE", "0") == "1", + "Skipping because beartype would fail test", +) diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index a6f0b7801..dd1236dd9 100644 --- a/pysr/test/test_main.py +++ 
b/pysr/test/test_main.py @@ -35,6 +35,7 @@ DEFAULT_NITERATIONS, DEFAULT_PARAMS, DEFAULT_POPULATIONS, + skip_if_beartype, ) # Disables local saving: @@ -354,9 +355,8 @@ def test_warm_start_set_at_init(self): def test_noisy_builtin_variable_names(self): y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05 model = PySRRegressor( - # Test that passing a single operator works: - unary_operators="sq(x) = x^2", - binary_operators="plus", + unary_operators=["sq(x) = x^2"], + binary_operators=["plus"], extra_sympy_mappings={"sq": lambda x: x**2}, **self.default_test_kwargs, procs=0, @@ -475,9 +475,8 @@ def test_load_model_simple(self): # Test that we can simply load a model from its equation file. y = self.X[:, [0, 1]] ** 2 model = PySRRegressor( - # Test that passing a single operator works: - unary_operators="sq(x) = x^2", - binary_operators="plus", + unary_operators=["sq(x) = x^2"], + binary_operators=["plus"], extra_sympy_mappings={"sq": lambda x: x**2}, **self.default_test_kwargs, procs=0, @@ -866,6 +865,7 @@ def test_load_all_packages(self): class TestHelpMessages(unittest.TestCase): """Test user help messages.""" + @skip_if_beartype def test_deprecation(self): """Ensure that deprecation works as expected. @@ -972,6 +972,7 @@ def test_bad_variable_names_fail(self): model.fit(X, y, variable_names=["f{c}"]) self.assertIn("Invalid variable name", str(cm.exception)) + @skip_if_beartype def test_bad_kwargs(self): bad_kwargs = [ dict( diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py index 512902b24..256d21f86 100644 --- a/pysr/test/test_torch.py +++ b/pysr/test/test_torch.py @@ -141,17 +141,21 @@ def test_custom_operator(self): Path(model.output_directory_) / model.run_id_ / fname ) + MyCustomOperator = sympy.Function("mycustomoperator") + model.set_params( - extra_sympy_mappings={"mycustomoperator": sympy.sin}, - extra_torch_mappings={"mycustomoperator": self.torch.sin}, + extra_sympy_mappings={"mycustomoperator": MyCustomOperator}, + extra_torch_mappings={MyCustomOperator: self.torch.sin}, ) # TODO: We shouldn't need to specify the run directory here. model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_)) - self.assertEqual(str(model.sympy()), "sin(x1)") + # self.assertEqual(str(model.sympy()), "sin(x1)") # Will automatically use the set global state from get_hof. 
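+        # (`mycustomoperator` is exported as an undefined sympy Function, so it
+        #  keeps its name in the sympy string, while `extra_torch_mappings`
+        #  evaluates it as `torch.sin`; this is checked numerically below.)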
tformat = model.pytorch() - self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))") + self.assertEqual( + str(tformat), "_SingleSymPyModule(expression=mycustomoperator(x1))" + ) np.testing.assert_almost_equal( tformat(self.torch.tensor(X)).detach().numpy(), np.sin(X[:, 1]), @@ -204,7 +208,6 @@ def cos_approx(x): maxsize=10, early_stop_condition=1e-5, extra_sympy_mappings={"cos_approx": cos_approx}, - extra_torch_mappings={"cos_approx": cos_approx}, random_state=0, deterministic=True, parallelism="serial", From 147432b79911d3ee5586b2ee5baf7d257059650e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 00:02:54 +0000 Subject: [PATCH 73/92] test: fix jax mapping test --- pysr/test/test_jax.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py index 98e915315..d261afcf5 100644 --- a/pysr/test/test_jax.py +++ b/pysr/test/test_jax.py @@ -124,6 +124,8 @@ def test_feature_selection_custom_operators(self): def cos_approx(x): return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720 + sp_cos_approx = sympy.Function("cos_approx") + y = X["k15"] ** 2 + 2 * cos_approx(X["k20"]) model = PySRRegressor( @@ -132,9 +134,9 @@ def cos_approx(x): select_k_features=3, maxsize=10, early_stop_condition=1e-5, - extra_sympy_mappings={"cos_approx": cos_approx}, + extra_sympy_mappings={"cos_approx": sp_cos_approx}, extra_jax_mappings={ - "cos_approx": "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)" + sp_cos_approx: "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)" }, random_state=0, deterministic=True, @@ -143,14 +145,8 @@ def cos_approx(x): np.random.seed(0) model.fit(X.values, y.values) f, parameters = model.jax().values() - - np_prediction = model.predict jax_prediction = partial(f, parameters=parameters) - - np_output = np_prediction(X.values) jax_output = jax_prediction(X.values) - - np.testing.assert_almost_equal(y.values, np_output, decimal=3) np.testing.assert_almost_equal(y.values, jax_output, decimal=3) From c5404f8242a67653e0020e1a9fe741ec34237fde Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 00:35:19 +0000 Subject: [PATCH 74/92] deps: require Python >= 3.10 --- .github/workflows/CI.yml | 10 +++++----- .github/workflows/CI_conda_forge.yml | 6 +++--- .github/workflows/CI_docker_large_nightly.yml | 4 ++-- .github/workflows/CI_large_nightly.yml | 4 ++-- environment.yml | 3 +-- pyproject.toml | 3 +-- pysr/denoising.py | 2 -- pysr/export.py | 9 +-------- pysr/export_latex.py | 2 -- pysr/export_numpy.py | 2 -- pysr/export_sympy.py | 9 +-------- pysr/expression_specs.py | 10 +--------- pysr/feature_selection.py | 2 -- pysr/julia_helpers.py | 8 +------- pysr/sr.py | 8 +------- 15 files changed, 19 insertions(+), 63 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8c89516dd..b2659fde7 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,7 +31,7 @@ jobs: test-id: [main] include: - julia-version: '1.10' - python-version: '3.9' + python-version: '3.10' os: ubuntu-latest test-id: include - julia-version: '1' @@ -89,7 +89,7 @@ jobs: julia-version: ['1'] include: - os: ubuntu-latest - python-version: '3.9' + python-version: '3.10' julia-version: '1.10' steps: - uses: actions/checkout@v4 @@ -168,7 +168,7 @@ jobs: matrix: python-version: - '3.12' - - '3.9' + - '3.10' os: ['ubuntu-latest'] steps: @@ -184,10 +184,10 @@ jobs: pip install '.[dev]' - name: "Run mypy" run: python -m mypy --install-types --non-interactive pysr - if: ${{ 
matrix.python-version != '3.9' }} + if: ${{ matrix.python-version != '3.10' }} - name: "Run compatible mypy" run: python -m mypy --ignore-missing-imports pysr - if: ${{ matrix.python-version == '3.9' }} + if: ${{ matrix.python-version == '3.10' }} beartype: name: Test with beartype diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml index 98d064b93..097a22bc2 100644 --- a/.github/workflows/CI_conda_forge.yml +++ b/.github/workflows/CI_conda_forge.yml @@ -20,17 +20,17 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.10', '3.11', '3.12'] os: ['ubuntu-latest'] use-mamba: [true, false] include: - - python-version: 3.9 + - python-version: 3.10 os: 'windows-latest' use-mamba: true - python-version: 3.12 os: 'windows-latest' use-mamba: true - - python-version: 3.9 + - python-version: 3.10 os: 'macos-latest' use-mamba: true - python-version: 3.12 diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml index 35d15fbaa..4e6dbf8ba 100644 --- a/.github/workflows/CI_docker_large_nightly.yml +++ b/.github/workflows/CI_docker_large_nightly.yml @@ -18,8 +18,8 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1.6', '1'] - python-version: ['3.9', '3.12'] + julia-version: ['1.10', '1'] + python-version: ['3.10', '3.12'] os: [ubuntu-latest] arch: ['linux/amd64', 'linux/arm64'] diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml index 369d4bf47..4598468bf 100644 --- a/.github/workflows/CI_large_nightly.yml +++ b/.github/workflows/CI_large_nightly.yml @@ -23,8 +23,8 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1.6', '1.8', '1.10'] - python-version: ['3.9', '3.10', '3.12'] + julia-version: ['1.10', '1'] + python-version: ['3.10', '3.12'] os: [ubuntu-latest, macos-latest, windows-latest] steps: diff --git a/environment.yml b/environment.yml index 840b6a1c3..fbed19f6f 100644 --- a/environment.yml +++ b/environment.yml @@ -2,11 +2,10 @@ name: test channels: - conda-forge dependencies: - - python>=3.9 + - python>=3.10 - sympy>=1.0.0,<2.0.0 - pandas>=0.21.0,<3.0.0 - numpy>=1.13.0,<2.0.0 - scikit-learn>=1.0.0,<2.0.0 - pyjuliacall>=0.9.21,<0.9.22 - click>=7.0.0,<9.0.0 - - typing-extensions>=4.0.0,<5.0.0 diff --git a/pyproject.toml b/pyproject.toml index c9ed895f0..c3ed96227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ description = "Simple and efficient symbolic regression" readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", @@ -25,7 +25,6 @@ dependencies = [ "juliacall==0.9.23", "click>=7.0.0,<9.0.0", "setuptools>=50.0.0", - "typing-extensions>=4.0.0,<5.0.0", ] [project.optional-dependencies] diff --git a/pysr/denoising.py b/pysr/denoising.py index fbf17468b..92a640ee8 100644 --- a/pysr/denoising.py +++ b/pysr/denoising.py @@ -1,7 +1,5 @@ """Functions for denoising data during preprocessing.""" -from __future__ import annotations - from typing import cast import numpy as np diff --git a/pysr/export.py b/pysr/export.py index bc88b7e15..c1b589b84 100644 --- a/pysr/export.py +++ b/pysr/export.py @@ -1,12 +1,5 @@ -from __future__ import annotations - import copy -import sys - -if sys.version_info >= (3, 10): - from collections.abc import Callable -else: - from typing import Callable 
+from collections.abc import Callable import numpy as np import pandas as pd diff --git a/pysr/export_latex.py b/pysr/export_latex.py index f00fc48ad..55d24de0b 100644 --- a/pysr/export_latex.py +++ b/pysr/export_latex.py @@ -1,7 +1,5 @@ """Functions to help export PySR equations to LaTeX.""" -from __future__ import annotations - import pandas as pd import sympy # type: ignore from sympy.printing.latex import LatexPrinter # type: ignore diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py index c67834985..163726d6f 100644 --- a/pysr/export_numpy.py +++ b/pysr/export_numpy.py @@ -1,7 +1,5 @@ """Code for exporting discovered expressions to numpy""" -from __future__ import annotations - import warnings import numpy as np diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py index 9f3112e87..ea54b01c6 100644 --- a/pysr/export_sympy.py +++ b/pysr/export_sympy.py @@ -1,13 +1,6 @@ """Define utilities to export to sympy""" -from __future__ import annotations - -import sys - -if sys.version_info >= (3, 10): - from collections.abc import Callable -else: - from typing import Callable +from collections.abc import Callable import sympy # type: ignore from sympy import sympify diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py index b7103d357..e86064785 100644 --- a/pysr/expression_specs.py +++ b/pysr/expression_specs.py @@ -1,14 +1,6 @@ -from __future__ import annotations - import copy -import sys from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, NewType - -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias +from typing import TYPE_CHECKING, Any, NewType, TypeAlias import numpy as np import pandas as pd diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py index a0e4d2072..8c8358fdd 100644 --- a/pysr/feature_selection.py +++ b/pysr/feature_selection.py @@ -1,7 +1,5 @@ """Functions for doing feature selection during preprocessing.""" -from __future__ import annotations - from typing import cast import numpy as np diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index 917c4fd3d..a93b6265d 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -1,12 +1,6 @@ """Functions for initializing the Julia environment and installing deps.""" -import sys -from typing import Any, cast, overload - -if sys.version_info >= (3, 10): - from typing import Callable -else: - from collections.abc import Callable +from typing import Any, Callable, cast, overload import numpy as np from juliacall import convert as jl_convert # type: ignore diff --git a/pysr/sr.py b/pysr/sr.py index 6dc5169fe..2c7d76534 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1,7 +1,5 @@ """Define the PySRRegressor scikit-learn interface.""" -from __future__ import annotations - import copy import os import pickle as pkl @@ -9,17 +7,13 @@ import sys import tempfile import warnings +from collections.abc import Callable from dataclasses import dataclass, fields from io import StringIO from multiprocessing import cpu_count from pathlib import Path from typing import Any, Literal, cast -if sys.version_info >= (3, 10): - from collections.abc import Callable -else: - from typing import Callable - import numpy as np import pandas as pd from numpy import ndarray From 95fb9d1ca12e4f497c3e01e5fe719eef40dd9ca6 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 00:52:26 +0000 Subject: [PATCH 75/92] deps: fix pyjuliacall version --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/environment.yml b/environment.yml index fbed19f6f..994390c98 100644 --- a/environment.yml +++ b/environment.yml @@ -7,5 +7,5 @@ dependencies: - pandas>=0.21.0,<3.0.0 - numpy>=1.13.0,<2.0.0 - scikit-learn>=1.0.0,<2.0.0 - - pyjuliacall>=0.9.21,<0.9.22 + - pyjuliacall>=0.9.22,<0.9.23 - click>=7.0.0,<9.0.0 From 762d8f9e93daf0076a7137149c9ccc8b207ed47f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 01:02:04 +0000 Subject: [PATCH 76/92] test: fix type annotation --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index 2c7d76534..d196ded84 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -1080,7 +1080,7 @@ def from_file( assert unary_operators is None assert n_features_in is None with open(pkl_filename, "rb") as f: - model: "pysr.sr.PySRRegressor" = pkl.load(f) + model = cast("PySRRegressor", pkl.load(f)) # Update any parameters if necessary, such as # extra_sympy_mappings: From 9e87e0377eb4112777771e879764690f80b38412 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 01:26:54 +0000 Subject: [PATCH 77/92] ci: tweak conda backend --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b2659fde7..b470772ec 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -123,7 +123,7 @@ jobs: - name: "Set up Conda" uses: conda-incubator/setup-miniconda@v3 with: - miniforge-variant: Mambaforge + miniforge-variant: Miniforge3 miniforge-version: latest auto-activate-base: true python-version: ${{ matrix.python-version }} From 324d1f696ca8c0a8d5a7bbfcc6af0ac4feffc9f2 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 01:52:12 +0000 Subject: [PATCH 78/92] refactor: more typing info --- pysr/sr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysr/sr.py b/pysr/sr.py index d196ded84..37e449c72 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -841,7 +841,7 @@ def __init__( bumper: bool = False, precision: Literal[16, 32, 64] = 32, enable_autodiff: bool = False, - random_state=None, + random_state: int | np.random.RandomState | None = None, deterministic: bool = False, warm_start: bool = False, verbosity: int = 1, From 4dc4e6fbae81f4ebbd8bd68ee307863e9abdc0d2 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 02:50:44 +0000 Subject: [PATCH 79/92] docs: add example with template expression --- docs/_api.md | 20 +++++++++++ docs/examples.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/docs/_api.md b/docs/_api.md index bbc6c1f2f..864a34ad4 100644 --- a/docs/_api.md +++ b/docs/_api.md @@ -60,3 +60,23 @@ PARAMSKEY show_root_heading: true heading_level: 3 show_root_full_path: false + +## Expression Specifications + +::: pysr.ExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.TemplateExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.ParametricExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false diff --git a/docs/examples.md b/docs/examples.md index b666c4102..db50d1c37 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -523,7 +523,93 @@ Note that this expression has a large dynamic range so may be difficult to find. Note that you can also search for exclusively dimensionless constants by settings `dimensionless_constants_only` to `true`. 
-## 11. Additional features
+## 11. Expression Specifications
+
+PySR 1.0 introduces powerful expression specifications that allow you to define structured equations. Here are two examples:
+
+### Template Expressions
+
+`TemplateExpressionSpec` allows you to define a specific structure for the equation.
+For example, let's say we want to learn an equation of the form:
+
+$$ y = \sin(f(x_1, x_2)) + g(x_3) $$
+
+We can do this as follows:
+
+```python
+import numpy as np
+from pysr import PySRRegressor, TemplateExpressionSpec
+
+# Create data
+X = np.random.randn(1000, 3)
+y = np.sin(X[:, 0] + X[:, 1]) + X[:, 2]**2
+
+# Define template: we want sin(f(x1, x2)) + g(x3)
+template = TemplateExpressionSpec(
+    function_symbols=["f", "g"],
+    combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)",
+)
+
+model = PySRRegressor(
+    expression_spec=template,
+    binary_operators=["+", "*", "-", "/"],
+    unary_operators=["sin"],
+    maxsize=10,
+)
+model.fit(X, y)
+```
+
+You can also use zero-argument functions to learn constants, for example:
+
+```python
+template = TemplateExpressionSpec(
+    function_symbols=["a", "f"],
+    combine="((; a, f), (x, y)) -> a() * sin(f(x, y))",
+)
+```
+
+### Parametric Expressions
+
+When your data has categories with shared equation structure but different parameters,
+you can use a `ParametricExpressionSpec`. Let's say we would like to learn the expression:
+
+$$ y = \alpha \sin(x_1) + \beta $$
+
+for three different values of $\alpha$ and $\beta$.
+
+```python
+import numpy as np
+from pysr import PySRRegressor, ParametricExpressionSpec
+
+# Create data with 3 categories
+X = np.random.uniform(-3, 3, (1000, 2))
+category = np.random.randint(0, 3, 1000)
+
+# Parameters for each category
+offsets = [0.1, 1.5, -0.5]
+scales = [1.0, 2.0, 0.5]
+
+# y = scale[category] * sin(x1) + offset[category]
+y = np.array([
+    scales[c] * np.sin(x1) + offsets[c]
+    for x1, c in zip(X[:, 0], category)
+])
+
+model = PySRRegressor(
+    expression_spec=ParametricExpressionSpec(max_parameters=2),
+    binary_operators=["+", "*", "-", "/"],
+    unary_operators=["sin"],
+    maxsize=10,
+)
+model.fit(X, y, category=category)
+
+# Predicting on new data:
+# model.predict(X_test, category=category_test)
+```
+
+See [Expression Specifications](expression-specs.md) for more details.
+
+## 12. Additional features
 
 For the many other features available in PySR, please read the [Options section](options.md).
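The docs added above stop at `model.fit`; a quick way to check what the template search actually found is to compare predictions against the ground truth. The following is a minimal sketch, not part of the patch itself: it assumes the `model`, `X`, and `y` from the template-expression example above are still in scope, and that `get_best` returns a row describing the best expression, as it does for ordinary PySR fits.

```python
import numpy as np

# Compare the fitted template expression against the ground truth:
y_pred = model.predict(X)
print("max abs error:", np.max(np.abs(y_pred - y)))

# Inspect the best expression found; for template expressions the printed
# form should include the learned subexpressions f and g:
print(model.get_best())
```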
From 2f2fba5a9a4ebf9aa7e729c232691c117272c506 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 03:10:02 +0000 Subject: [PATCH 80/92] feat: introduce `autodiff_backend` --- pysr/julia_extensions.py | 8 ++++---- pysr/param_groupings.yml | 2 +- pysr/sr.py | 23 ++++++++++++++--------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py index 54c01c6a3..a07caf36b 100644 --- a/pysr/julia_extensions.py +++ b/pysr/julia_extensions.py @@ -1,6 +1,6 @@ """This file installs and loads extensions for SymbolicRegression.""" -from __future__ import annotations +from typing import Literal from .julia_import import Pkg, jl @@ -9,14 +9,14 @@ def load_required_packages( *, turbo: bool = False, bumper: bool = False, - enable_autodiff: bool = False, + autodiff_backend: Literal["Zygote"] | None = None, cluster_manager: str | None = None, ): if turbo: load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890") if bumper: load_package("Bumper", "8ce10254-0962-460f-a3d8-1f77fea1446e") - if enable_autodiff: + if autodiff_backend is not None: load_package("Zygote", "e88e6eb3-aa80-5325-afca-941959d7151f") if cluster_manager is not None: load_package("ClusterManagers", "34f1f09b-3a8b-5176-ab39-66d58a4d544e") @@ -25,7 +25,7 @@ def load_required_packages( def load_all_packages(): """Install and load all Julia extensions available to PySR.""" load_required_packages( - turbo=True, bumper=True, enable_autodiff=True, cluster_manager="slurm" + turbo=True, bumper=True, autodiff_backend="Zygote", cluster_manager="slurm" ) diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml index 98ebee250..c76dc7e93 100644 --- a/pysr/param_groupings.yml +++ b/pysr/param_groupings.yml @@ -81,7 +81,7 @@ - fast_cycle - turbo - bumper - - enable_autodiff + - autodiff_backend - Determinism: - random_state - deterministic diff --git a/pysr/sr.py b/pysr/sr.py index 37e449c72..0fff73ff1 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -576,11 +576,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): If you pass complex data, the corresponding complex precision will be used (i.e., `64` for complex128, `32` for complex64). Default is `32`. - enable_autodiff : bool - Whether to create derivative versions of operators for automatic - differentiation. This is only necessary if you wish to compute - the gradients of an expression within a custom loss function. - Default is `False`. + autodiff_backend : Literal["Zygote"] | None + Which backend to use for automatic differentiation during constant + optimization. Currently only `"Zygote"` is supported. The default, + `None`, uses forward-mode or finite difference. + Default is `None`. random_state : int, Numpy RandomState instance or None Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. 
@@ -840,7 +840,7 @@ def __init__( turbo: bool = False, bumper: bool = False, precision: Literal[16, 32, 64] = 32, - enable_autodiff: bool = False, + autodiff_backend: Literal["Zygote"] | None = None, random_state: int | np.random.RandomState | None = None, deterministic: bool = False, warm_start: bool = False, @@ -943,7 +943,7 @@ def __init__( self.turbo = turbo self.bumper = bumper self.precision = precision - self.enable_autodiff = enable_autodiff + self.autodiff_backend = autodiff_backend self.random_state = random_state self.deterministic = deterministic self.warm_start = warm_start @@ -1869,10 +1869,15 @@ def _run( load_required_packages( turbo=self.turbo, bumper=self.bumper, - enable_autodiff=self.enable_autodiff, + autodiff_backend=self.autodiff_backend, cluster_manager=cluster_manager, ) + if self.autodiff_backend is not None: + autodiff_backend = jl.Symbol(self.autodiff_backend) + else: + autodiff_backend = None + mutation_weights = SymbolicRegression.MutationWeights( mutate_constant=self.weight_mutate_constant, mutate_operator=self.weight_mutate_operator, @@ -1940,7 +1945,7 @@ def _run( fast_cycle=self.fast_cycle, turbo=self.turbo, bumper=self.bumper, - enable_autodiff=self.enable_autodiff, + autodiff_backend=autodiff_backend, migration=self.migration, hof_migration=self.hof_migration, fraction_replaced_hof=self.fraction_replaced_hof, From 5d154a18fd2d124ec2504d462ffae7557b6af981 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 19:13:39 +0000 Subject: [PATCH 81/92] refactor: removed deprecated pythoncall commands --- pysr/sr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pysr/sr.py b/pysr/sr.py index 0fff73ff1..4b37c2ae8 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -40,7 +40,6 @@ from .feature_selection import run_feature_selection from .julia_extensions import load_required_packages from .julia_helpers import ( - PythonCall, _escape_filename, _load_cluster_manager, jl_array, @@ -2016,7 +2015,6 @@ def _run( else: jl_y_variable_names = None - PythonCall.GC.disable() out = SymbolicRegression.equation_search( jl_X, jl_y, @@ -2047,7 +2045,6 @@ def _run( and len(y.shape) == 1, verbosity=int(self.verbosity), ) - PythonCall.GC.enable() self.julia_state_stream_ = jl_serialize(out) From 7a1cd8a2db264ab804794461f136b11625738e33 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 19:56:30 +0000 Subject: [PATCH 82/92] feat: create `TensorBoardLoggerSpec` --- pysr/__init__.py | 3 +++ pysr/julia_extensions.py | 10 ++++++++- pysr/logger_specs.py | 47 ++++++++++++++++++++++++++++++++++++++++ pysr/sr.py | 13 ++++++++++- 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 pysr/logger_specs.py diff --git a/pysr/__init__.py b/pysr/__init__.py index 4a13af3f3..e26174aba 100644 --- a/pysr/__init__.py +++ b/pysr/__init__.py @@ -21,6 +21,7 @@ TemplateExpressionSpec, ) from .julia_extensions import load_all_packages +from .logger_specs import AbstractLoggerSpec, TensorBoardLoggerSpec from .sr import PySRRegressor # This file is created by setuptools_scm during the build process: @@ -39,6 +40,8 @@ "ExpressionSpec", "TemplateExpressionSpec", "ParametricExpressionSpec", + "AbstractLoggerSpec", + "TensorBoardLoggerSpec", "best", "best_callable", "best_row", diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py index a07caf36b..db29d27e1 100644 --- a/pysr/julia_extensions.py +++ b/pysr/julia_extensions.py @@ -3,6 +3,7 @@ from typing import Literal from .julia_import import Pkg, jl +from .logger_specs import AbstractLoggerSpec, 
TensorBoardLoggerSpec def load_required_packages( @@ -11,6 +12,7 @@ def load_required_packages( bumper: bool = False, autodiff_backend: Literal["Zygote"] | None = None, cluster_manager: str | None = None, + logger: AbstractLoggerSpec | None = None, ): if turbo: load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890") @@ -20,12 +22,18 @@ def load_required_packages( load_package("Zygote", "e88e6eb3-aa80-5325-afca-941959d7151f") if cluster_manager is not None: load_package("ClusterManagers", "34f1f09b-3a8b-5176-ab39-66d58a4d544e") + if isinstance(logger, TensorBoardLoggerSpec): + load_package("TensorBoardLogger", "899adc3e-224a-11e9-021f-63837185c80f") def load_all_packages(): """Install and load all Julia extensions available to PySR.""" load_required_packages( - turbo=True, bumper=True, autodiff_backend="Zygote", cluster_manager="slurm" + turbo=True, + bumper=True, + autodiff_backend="Zygote", + cluster_manager="slurm", + logger=TensorBoardLoggerSpec(log_dir="logs"), ) diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py new file mode 100644 index 000000000..38341fa5f --- /dev/null +++ b/pysr/logger_specs.py @@ -0,0 +1,47 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass + +from .julia_import import AnyValue, jl + + +class AbstractLoggerSpec(ABC): + """Abstract base class for logger specifications.""" + + @abstractmethod + def create_logger(self) -> AnyValue: + """Create a logger instance.""" + pass + + +@dataclass +class TensorBoardLoggerSpec(AbstractLoggerSpec): + """Specification for TensorBoard logger. + + Attributes: + ---------- + log_dir : str + Directory where TensorBoard logs will be saved. + log_interval : int, optional + Interval (in steps) at which logs are written. Default is 2. + overwrite : bool, optional + Whether to overwrite existing logs in the directory. Default is True. + """ + + log_dir: str + log_interval: int = 2 + overwrite: bool = True + + def create_logger(self) -> AnyValue: + make_logger = jl.seval( + """ + function make_logger(log_dir::AbstractString, overwrite::Bool, log_interval::Int) + base_logger = TensorBoardLogger.TBLogger( + log_dir, + (overwrite ? (TensorBoardLogger.tb_overwrite,) : ())... + ) + return SRLogger(; logger=base_logger, log_interval) + end + """ + ) + log_dir = str(self.log_dir) + return make_logger(log_dir, self.overwrite, self.log_interval) diff --git a/pysr/sr.py b/pysr/sr.py index 4b37c2ae8..1bd3db195 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -48,6 +48,7 @@ jl_serialize, ) from .julia_import import AnyValue, SymbolicRegression, VectorValue, jl +from .logger_specs import AbstractLoggerSpec from .utils import ( ArrayLike, PathLike, @@ -379,7 +380,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Default is `None`. complexity_of_constants : int | float Complexity of constants. Default is `1`. - complexity_of_variables : int | float + complexity_of_variables : int | float | list[int | float] Global complexity of variables. To set different complexities for different variables, pass a list of complexities to the `fit` method with keyword `complexity_of_variables`. You cannot use both. @@ -606,6 +607,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): progress : bool Whether to use a progress bar instead of printing to stdout. Default is `True`. + logger: AbstractLoggerSpec | None + Logger specification for the Julia backend. See, for example, + `TensorBoardLoggerSpec`. + Default is `None`. run_id : str A unique identifier for the run. 
Will be generated using the current date and time if not provided. @@ -847,6 +852,7 @@ def __init__( update_verbosity: int | None = None, print_precision: int = 5, progress: bool = True, + logger: AbstractLoggerSpec | None = None, run_id: str | None = None, output_directory: str | None = None, temp_equation_file: bool = False, @@ -952,6 +958,7 @@ def __init__( self.update_verbosity = update_verbosity self.print_precision = print_precision self.progress = progress + self.logger = logger # - Project management self.run_id = run_id self.output_directory = output_directory @@ -1870,6 +1877,7 @@ def _run( bumper=self.bumper, autodiff_backend=self.autodiff_backend, cluster_manager=cluster_manager, + logger=self.logger, ) if self.autodiff_backend is not None: @@ -1909,6 +1917,8 @@ def _run( jl.seval(self.complexity_mapping) if self.complexity_mapping else None ) + logger = self.logger.create_logger() if self.logger else None + # Call to Julia backend. # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl options = SymbolicRegression.Options( @@ -2044,6 +2054,7 @@ def _run( and self.verbosity > 0 and len(y.shape) == 1, verbosity=int(self.verbosity), + logger=logger, ) self.julia_state_stream_ = jl_serialize(out) From 23de62118da79c7d9088dca60bb5aa42852139f3 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 20:04:22 +0000 Subject: [PATCH 83/92] feat: tweak defaults of TensorBoardLoggerSpec --- pysr/logger_specs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py index 38341fa5f..27a332dbf 100644 --- a/pysr/logger_specs.py +++ b/pysr/logger_specs.py @@ -20,16 +20,17 @@ class TensorBoardLoggerSpec(AbstractLoggerSpec): Attributes: ---------- log_dir : str - Directory where TensorBoard logs will be saved. + Directory where TensorBoard logs will be saved. If `overwrite` is `False`, + new logs will be saved to `{log_dir}_1`, and so on. Default is `"logs/run"`. log_interval : int, optional Interval (in steps) at which logs are written. Default is 2. overwrite : bool, optional - Whether to overwrite existing logs in the directory. Default is True. + Whether to overwrite existing logs in the directory. Default is False. 
""" - log_dir: str + log_dir: str = "logs/run" log_interval: int = 2 - overwrite: bool = True + overwrite: bool = False def create_logger(self) -> AnyValue: make_logger = jl.seval( From 5690c4e9354a8a602d540b2818c477b04a1ab00d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 20:08:30 +0000 Subject: [PATCH 84/92] refactor: use name `logger_spec` instead of `logger` --- pysr/julia_extensions.py | 6 +++--- pysr/logger_specs.py | 4 ++++ pysr/sr.py | 13 ++++++++----- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py index db29d27e1..72273f3e6 100644 --- a/pysr/julia_extensions.py +++ b/pysr/julia_extensions.py @@ -12,7 +12,7 @@ def load_required_packages( bumper: bool = False, autodiff_backend: Literal["Zygote"] | None = None, cluster_manager: str | None = None, - logger: AbstractLoggerSpec | None = None, + logger_spec: AbstractLoggerSpec | None = None, ): if turbo: load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890") @@ -22,7 +22,7 @@ def load_required_packages( load_package("Zygote", "e88e6eb3-aa80-5325-afca-941959d7151f") if cluster_manager is not None: load_package("ClusterManagers", "34f1f09b-3a8b-5176-ab39-66d58a4d544e") - if isinstance(logger, TensorBoardLoggerSpec): + if isinstance(logger_spec, TensorBoardLoggerSpec): load_package("TensorBoardLogger", "899adc3e-224a-11e9-021f-63837185c80f") @@ -33,7 +33,7 @@ def load_all_packages(): bumper=True, autodiff_backend="Zygote", cluster_manager="slurm", - logger=TensorBoardLoggerSpec(log_dir="logs"), + logger_spec=TensorBoardLoggerSpec(log_dir="logs"), ) diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py index 27a332dbf..49bcba1af 100644 --- a/pysr/logger_specs.py +++ b/pysr/logger_specs.py @@ -12,6 +12,10 @@ def create_logger(self) -> AnyValue: """Create a logger instance.""" pass + def close(self, logger: AnyValue) -> None: + """Close the logger.""" + jl.close(logger) + @dataclass class TensorBoardLoggerSpec(AbstractLoggerSpec): diff --git a/pysr/sr.py b/pysr/sr.py index 1bd3db195..172ffb1fd 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -607,7 +607,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): progress : bool Whether to use a progress bar instead of printing to stdout. Default is `True`. - logger: AbstractLoggerSpec | None + logger_spec: AbstractLoggerSpec | None Logger specification for the Julia backend. See, for example, `TensorBoardLoggerSpec`. Default is `None`. 
@@ -852,7 +852,7 @@ def __init__( update_verbosity: int | None = None, print_precision: int = 5, progress: bool = True, - logger: AbstractLoggerSpec | None = None, + logger_spec: AbstractLoggerSpec | None = None, run_id: str | None = None, output_directory: str | None = None, temp_equation_file: bool = False, @@ -958,7 +958,7 @@ def __init__( self.update_verbosity = update_verbosity self.print_precision = print_precision self.progress = progress - self.logger = logger + self.logger_spec = logger_spec # - Project management self.run_id = run_id self.output_directory = output_directory @@ -1877,7 +1877,7 @@ def _run( bumper=self.bumper, autodiff_backend=self.autodiff_backend, cluster_manager=cluster_manager, - logger=self.logger, + logger_spec=self.logger_spec, ) if self.autodiff_backend is not None: @@ -1917,7 +1917,7 @@ def _run( jl.seval(self.complexity_mapping) if self.complexity_mapping else None ) - logger = self.logger.create_logger() if self.logger else None + logger = self.logger_spec.create_logger() if self.logger_spec else None # Call to Julia backend. # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl @@ -2059,6 +2059,9 @@ def _run( self.julia_state_stream_ = jl_serialize(out) + if logger: + self.logger_spec.close(logger) + # Set attributes self.equations_ = self.get_hof(out) From 8eb512a3e340c3ef40af67244a8263563c174ad4 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 21:08:10 +0000 Subject: [PATCH 85/92] feat: log hyperparams in tensorboard --- pysr/julia_helpers.py | 4 ++++ pysr/logger_specs.py | 34 +++++++++++++++++++++++++++++----- pysr/sr.py | 5 ++--- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py index a93b6265d..ef82be902 100644 --- a/pysr/julia_helpers.py +++ b/pysr/julia_helpers.py @@ -43,6 +43,10 @@ def jl_array(x, dtype=None): return jl_convert(jl.Array[dtype], x) +def jl_dict(x): + return jl_convert(jl.Dict, x) + + def jl_is_function(f) -> bool: return cast(bool, jl.seval("op -> op isa Function")(f)) diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py index 49bcba1af..008999eb4 100644 --- a/pysr/logger_specs.py +++ b/pysr/logger_specs.py @@ -1,6 +1,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from typing import Any +from .julia_helpers import jl_array, jl_dict from .julia_import import AnyValue, jl @@ -12,9 +14,10 @@ def create_logger(self) -> AnyValue: """Create a logger instance.""" pass - def close(self, logger: AnyValue) -> None: - """Close the logger.""" - jl.close(logger) + @abstractmethod + def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None: + """Write hyperparameters to the logger.""" + pass @dataclass @@ -27,16 +30,17 @@ class TensorBoardLoggerSpec(AbstractLoggerSpec): Directory where TensorBoard logs will be saved. If `overwrite` is `False`, new logs will be saved to `{log_dir}_1`, and so on. Default is `"logs/run"`. log_interval : int, optional - Interval (in steps) at which logs are written. Default is 2. + Interval (in steps) at which logs are written. Default is 10. overwrite : bool, optional Whether to overwrite existing logs in the directory. Default is False. 
""" log_dir: str = "logs/run" - log_interval: int = 2 + log_interval: int = 10 overwrite: bool = False def create_logger(self) -> AnyValue: + # We assume that TensorBoardLogger is already imported via `julia_extensions.py` make_logger = jl.seval( """ function make_logger(log_dir::AbstractString, overwrite::Bool, log_interval::Int) @@ -50,3 +54,23 @@ def create_logger(self) -> AnyValue: ) log_dir = str(self.log_dir) return make_logger(log_dir, self.overwrite, self.log_interval) + + def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None: + base_logger = jl.SymbolicRegression.get_logger(logger) + writer = jl.seval("TensorBoardLogger.write_hparams!") + jl_clean_hparams = jl_dict( + { + k: (v if isinstance(v, (bool, int, float)) else str(v)) + for k, v in hparams.items() + } + ) + writer( + base_logger, + jl_clean_hparams, + jl_array( + [ + "search/data/summaries/pareto_volume", + "search/data/summaries/min_loss", + ], + ), + ) diff --git a/pysr/sr.py b/pysr/sr.py index 172ffb1fd..8d8787c66 100644 --- a/pysr/sr.py +++ b/pysr/sr.py @@ -2056,12 +2056,11 @@ def _run( verbosity=int(self.verbosity), logger=logger, ) + if logger: + self.logger_spec.write_hparams(logger, self.get_params()) self.julia_state_stream_ = jl_serialize(out) - if logger: - self.logger_spec.close(logger) - # Set attributes self.equations_ = self.get_hof(out) From 2c1c8ed717e0b7c66615703927e8b03db48e0989 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 21:13:57 +0000 Subject: [PATCH 86/92] chore: add tensorboard to dev env --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3ed96227..b496a2213 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,11 +40,12 @@ dev = [ "nbval>=0.11,<0.12", "pandas-stubs", "pre-commit>=3.7,<5", + "pytest-cov>=5,<7", "pytest>=8,<9", + "tensorboard>=2,<3", "torch>=2,<3", "types-openpyxl", "types-pytz", - "pytest-cov>=5,<7", ] [tool.setuptools] From a5161fc3d8270616d28c835532189133ed2de2a9 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 21:33:56 +0000 Subject: [PATCH 87/92] test: tensorboard logger spec --- pysr/logger_specs.py | 2 +- pysr/test/test_main.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py index 008999eb4..070a243a0 100644 --- a/pysr/logger_specs.py +++ b/pysr/logger_specs.py @@ -36,7 +36,7 @@ class TensorBoardLoggerSpec(AbstractLoggerSpec): """ log_dir: str = "logs/run" - log_interval: int = 10 + log_interval: int = 1 overwrite: bool = False def create_logger(self) -> AnyValue: diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py index dd1236dd9..e84fd992e 100644 --- a/pysr/test/test_main.py +++ b/pysr/test/test_main.py @@ -16,6 +16,7 @@ ParametricExpressionSpec, PySRRegressor, TemplateExpressionSpec, + TensorBoardLoggerSpec, install, jl, load_all_packages, @@ -631,6 +632,52 @@ def test_parametric_expression(self): with self.assertRaises(ValueError): model.latex_table() + def test_tensorboard_logger(self): + """Test TensorBoard logger functionality.""" + try: + from tensorboard.backend.event_processing.event_accumulator import ( + EventAccumulator, + ) + except ImportError: + self.skipTest("TensorBoard not installed. 
Skipping test.") + + y = self.X[:, 0] + with tempfile.TemporaryDirectory() as tmpdir: + logger_spec = TensorBoardLoggerSpec( + log_dir=tmpdir, log_interval=2, overwrite=True + ) + model = PySRRegressor( + **self.default_test_kwargs, + logger_spec=logger_spec, + early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1", + ) + model.fit(self.X, y) + + # Verify log directory exists and contains TensorBoard files + log_dir = Path(tmpdir) + assert log_dir.exists() + files = list(log_dir.glob("events.out.tfevents.*")) + assert len(files) == 1 + + # Load and verify TensorBoard events + event_acc = EventAccumulator(str(log_dir)) + event_acc.Reload() + + # Check that we have the expected scalar summaries + scalars = event_acc.Tags()["scalars"] + self.assertIn("search/data/summaries/pareto_volume", scalars) + self.assertIn("search/data/summaries/min_loss", scalars) + + # Check that we have multiple events for each summary + pareto_events = event_acc.Scalars("search/data/summaries/pareto_volume") + min_loss_events = event_acc.Scalars("search/data/summaries/min_loss") + + self.assertGreater(len(pareto_events), 0) + self.assertGreater(len(min_loss_events), 0) + + # Verify model still works as expected + self.assertLessEqual(model.get_best()["loss"], 1e-4) + def manually_create_model(equations, feature_names=None): if feature_names is None: From 84cb12e584dc329bf652fe99c3778f8157331a34 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sat, 30 Nov 2024 21:45:59 +0000 Subject: [PATCH 88/92] docs: document logger specifications --- docs/_api.md | 20 ++++++++++++++++++++ docs/examples.md | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/docs/_api.md b/docs/_api.md index 864a34ad4..38b68818e 100644 --- a/docs/_api.md +++ b/docs/_api.md @@ -80,3 +80,23 @@ PARAMSKEY show_root_heading: true heading_level: 3 show_root_full_path: false + +::: pysr.AbstractExpressionSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +## Logger Specifications + +::: pysr.TensorBoardLoggerSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false + +::: pysr.AbstractLoggerSpec + options: + show_root_heading: true + heading_level: 3 + show_root_full_path: false diff --git a/docs/examples.md b/docs/examples.md index db50d1c37..8d01b8491 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -609,7 +609,42 @@ model.fit(X, y, category=category) See [Expression Specifications](expression-specs.md) for more details. -## 12. Additional features +## 12. Using TensorBoard for Logging + +You can use TensorBoard to visualize the search progress, as well as +record hyperparameters and final metrics (like `min_loss` and `pareto_volume` - the latter of which +is a performance measure of the entire Pareto front). + +```python +import numpy as np +from pysr import PySRRegressor, TensorBoardLoggerSpec + +rstate = np.random.RandomState(42) + +# Uniform dist between -3 and 3: +X = rstate.uniform(-3, 3, (1000, 2)) +y = np.exp(X[:, 0]) + X[:, 1] + +# Create a logger that writes to "logs/run*": +logger_spec = TensorBoardLoggerSpec( + log_dir="logs/run", + log_interval=10, # Log every 10 iterations +) + +model = PySRRegressor( + binary_operators=["+", "*", "-", "/"], + logger_spec=logger_spec, +) +model.fit(X, y) +``` + +You can then view the logs with: + +```bash +tensorboard --logdir logs/ +``` + +## 13. 
Additional features
 
 For the many other features available in PySR, please read the [Options section](options.md).

From be9f9d8e459829bece12868d27c5fd7d56d1b8fe Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 30 Nov 2024 21:50:13 +0000
Subject: [PATCH 89/92] chore: tweak coverage settings

---
 pysr/logger_specs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py
index 070a243a0..885412405 100644
--- a/pysr/logger_specs.py
+++ b/pysr/logger_specs.py
@@ -12,12 +12,12 @@ class AbstractLoggerSpec(ABC):
     @abstractmethod
     def create_logger(self) -> AnyValue:
         """Create a logger instance."""
-        pass
+        pass  # pragma: no cover
 
     @abstractmethod
     def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None:
         """Write hyperparameters to the logger."""
-        pass
+        pass  # pragma: no cover

From c7b63a3281146ebc4537369426b2b0800abecb8a Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 30 Nov 2024 22:03:51 +0000
Subject: [PATCH 90/92] docs: add to param groupings

---
 pysr/param_groupings.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index c76dc7e93..f43c67f94 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -91,6 +91,7 @@
     - update_verbosity
     - print_precision
     - progress
+    - logger_spec
 - Environment:
     - temp_equation_file
     - tempdir

From f16f39e092f518df0a213f491177bdcb35d47560 Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 30 Nov 2024 22:41:59 +0000
Subject: [PATCH 91/92] test: fix mypy errors

---
 pysr/sr.py             | 2 +-
 pysr/test/test_main.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pysr/sr.py b/pysr/sr.py
index 8d8787c66..aa9492e2a 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -2056,7 +2056,7 @@ def _run(
             verbosity=int(self.verbosity),
             logger=logger,
         )
-        if logger:
+        if self.logger_spec is not None:
             self.logger_spec.write_hparams(logger, self.get_params())
 
         self.julia_state_stream_ = jl_serialize(out)

diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py
index e84fd992e..8a0d348f8 100644
--- a/pysr/test/test_main.py
+++ b/pysr/test/test_main.py
@@ -635,7 +635,7 @@ def test_parametric_expression(self):
     def test_tensorboard_logger(self):
         """Test TensorBoard logger functionality."""
         try:
-            from tensorboard.backend.event_processing.event_accumulator import (
+            from tensorboard.backend.event_processing.event_accumulator import (  # type: ignore
                 EventAccumulator,
             )
         except ImportError:

From 0aaf3b9597380f592b2cf6af30f140661f23d0ba Mon Sep 17 00:00:00 2001
From: MilesCranmer
Date: Sat, 30 Nov 2024 22:46:28 +0000
Subject: [PATCH 92/92] test: skip tensorboard test on windows

---
 pysr/test/test_main.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pysr/test/test_main.py b/pysr/test/test_main.py
index 8a0d348f8..3264383ad 100644
--- a/pysr/test/test_main.py
+++ b/pysr/test/test_main.py
@@ -1,6 +1,7 @@
 import importlib
 import os
 import pickle as pkl
+import platform
 import tempfile
 import traceback
 import unittest
@@ -633,6 +634,10 @@ def test_parametric_expression(self):
         model.latex_table()
 
     def test_tensorboard_logger(self):
         """Test TensorBoard logger functionality."""
+
+        if platform.system() == "Windows":
+            self.skipTest("Skipping test on Windows")
+
         try:
             from tensorboard.backend.event_processing.event_accumulator import (  # type: ignore
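As a usage note to close out the series: the event-reading pattern introduced in the test above also works outside the test suite, for pulling the logged search metrics into a standalone script. This is a sketch under the assumption that a PySR fit has already written events via `TensorBoardLoggerSpec(log_dir="logs/run")`; the scalar tag names are the ones asserted in the test.

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

log_dir = "logs/run"  # wherever TensorBoardLoggerSpec wrote its events
event_acc = EventAccumulator(log_dir)
event_acc.Reload()

# The backend logs these scalar summaries (the same tags the test asserts on):
for tag in (
    "search/data/summaries/pareto_volume",
    "search/data/summaries/min_loss",
):
    events = event_acc.Scalars(tag)
    # Each ScalarEvent carries a wall time, a step, and a value:
    print(tag, [(e.step, e.value) for e in events[-3:]])
```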