Fix dephell and CI hell #3
Conversation
cc @maartenbreddels I think we're almost there, only a few things left. New pandas brings some issues:

```
def test_df_project():
    x = np.arange(10, dtype='i4')
    y = x**2
    df = vaex.from_arrays(x=x, y=y, z1=x+y, z2=x-y)
    # projecting 2 columns will drop 2 columns, which could be done in different order
    df_a = df[['x', 'y']]
    df_b = df[['x', 'y']]
    assert df_a.fingerprint() == df_b.fingerprint()
>   assert df_a.fingerprint() == 'dataframe-c13a4ab588272f03855ae5627731f7e5'
E   AssertionError: assert 'dataframe-d4...9ff888ba16e7c' == 'dataframe-c1...ae5627731f7e5'
E
E   - dataframe-c13a4ab588272f03855ae5627731f7e5
E   + dataframe-d4565ca8187231a051a9ff888ba16e7c
```

and

```
@pytest.mark.skipif(sys.version_info[:2] < (3, 7), reason="Python 36+Pandas has no Float64Dtype")
def test_from_pandas():
    dd_dict = {
        'boolean': [True, True, False, None, True],
        'text': ['This', 'is', 'some', 'text', 'so...'],
        'text_missing': pd.Series(['Some', 'parts', None, 'missing', None], dtype='string'),
        'float': [1, 30, -2, 1.5, 0.000],
        'float_missing': [1, None, -2, 1.5, 0.000],
        'float_missing_masked': pd.Series([1, None, -2, 1.5, 0.000], dtype=pd.Float64Dtype()),
        'int_missing': pd.Series([1, None, 5, 1, 10], dtype='Int64'),
        'datetime_1': [pd.NaT, datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1)],
        'datetime_2': [pd.NaT, None, pd.NaT, pd.NaT, pd.NaT],
>       'datetime_3': [pd.Timedelta('1M'), pd.Timedelta('1D'), pd.Timedelta('100M'), pd.Timedelta('2D'), pd.Timedelta('1H')],
        'datetime_4': [pd.Timestamp('2001-1-1 2:2:11'), pd.Timestamp('2001-12'), pd.Timestamp('2001-10-1'), pd.Timestamp('2001-03-1 2:2:11'), pd.Timestamp('2001-1-1 2:2:11')],
        'datetime_5': [datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1)],
        'datetime_6': [datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1)],
    }
tests/from_pandas_test.py:23:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
timedeltas.pyx:1844: in pandas._libs.tslibs.timedeltas.Timedelta.__new__
    ???
timedeltas.pyx:644: in pandas._libs.tslibs.timedeltas.parse_timedelta_string
    ???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
>   ???
E   ValueError: Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and are not supported.
```
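Side note on that one: a month has no fixed duration, which is exactly what modern pandas now rejects, while fixed-length units still parse. A minimal illustration (not from the PR):

```python
import pandas as pd

pd.Timedelta('1D')    # fine: a day has a fixed length
pd.Timedelta('1h')    # fine: so does an hour
# pd.Timedelta('1M')  # ValueError on recent pandas: months are ambiguous
```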
and

```
x = array(['2009-10-12T03:31:00.000000000'], dtype='datetime64[ns]')

    @register_function(scope='dt', as_property=True)
    def dt_weekofyear(x):
        """Returns the week ordinal of the year.

        :returns: an expression containing the week ordinal of the year, extracted from a datetime column.

        Example:

        >>> import vaex
        >>> import numpy as np
        >>> date = np.array(['2009-10-12T03:31:00', '2016-02-11T10:17:34', '2015-11-12T11:34:22'], dtype=np.datetime64)
        >>> df = vaex.from_arrays(date=date)
        >>> df
          #  date
          0  2009-10-12 03:31:00
          1  2016-02-11 10:17:34
          2  2015-11-12 11:34:22
        >>> df.date.dt.weekofyear
        Expression = dt_weekofyear(date)
        Length: 3 dtype: int64 (expression)
        -----------------------------------
        0  42
        1   6
        2  46
        """
>       return _to_pandas_series(x).dt.weekofyear.values
E       AttributeError: 'DatetimeProperties' object has no attribute 'weekofyear'
```
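Side note: `Series.dt.weekofyear` was removed in pandas 2.0; the documented replacement is `Series.dt.isocalendar().week`. A minimal sketch of the adjustment (dates reused from the docstring above), assuming the old int64 dtype should be preserved:

```python
import numpy as np
import pandas as pd

x = np.array(['2009-10-12T03:31:00', '2016-02-11T10:17:34'], dtype='datetime64[ns]')

# pandas >= 2.0: .dt.weekofyear is gone; isocalendar().week is the replacement
# (it returns the nullable UInt32 dtype, so cast back to int64 to keep the old dtype)
week = pd.Series(x).dt.isocalendar().week.to_numpy().astype(np.int64)
print(week)  # -> [42  6]
```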
It also looks like the new graphql version (needed for more recent Python versions) has an issue with the query syntax:

```
def test_aggregates(df, schema):
    result = schema.execute("""
    {
        df {
            count
            min {
                x
                y
            }
            mean {
                x
                y
            }
            max {
                x
                y
            }
        }
    }
    """)
>   assert not result.errors
E   assert not [GraphQLError("Aggregation.__init__() got an unexpected keyword argument 'df'", locations=[SourceLocation(line=3, column=9)], path=['df'])]
E   + where [GraphQLError("Aggregation.__init__() got an unexpected keyword argument 'df'", locations=[SourceLocation(line=3, column=9)], path=['df'])] = ExecutionResult(data={'df': None}, errors=[GraphQLError("Aggregation.__init__() got an unexpected keyword argument 'df'", locations=[SourceLocation(line=3, column=9)], path=['df'])]).errors
```

Also something wrong with csr:
```
def test_sparse_basics():
    df = vaex.from_arrays(x=x)
    df.add_columns(['s1', 's2'], s)
>   assert df.s1.tolist() == [0, 1, 2]
tests/sparse_test.py:12:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/vaex/expression.py:808: in tolist
    values = self.evaluate(i1=i1, i2=i2)
/opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/vaex/expression.py:1086: in evaluate
    return self.ds.evaluate(self, i1, i2, out=out, selection=selection, array_type=array_type, parallel=parallel)
/opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/vaex/dataframe.py:3095: in evaluate
    return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
/opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/vaex/dataframe.py:6481: in _evaluate_implementation
    arrays[expression] = arrays[expression][0:end-start]  # materialize fancy columns (lazy, indexed)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <vaex.column.ColumnSparse object at 0x7efd744a73b0>
slice = slice(0, 3, None)

    def __getitem__(self, slice):
        # not sure if this is the fastest
>       return self.matrix[slice, self.column_index].A[:,0]
E       AttributeError: 'csr_matrix' object has no attribute 'A'
```

Also tensorflow wants an extension now:

```
______________ test_keras_model_classification[df_factory_numpy] _______________
tmpdir = local('/tmp/pytest-of-runner/pytest-0/test_keras_model_classificatio0')
df_iris = #    sepal_length    sepal_width    petal_length    petal_width    class_
           0            5.9            3.0             4.2 ...        3.8            1.7             0.3          0
           149          6.2            2.9             4.3            1.3          1

    def test_keras_model_classification(tmpdir, df_iris):
        df = df_iris
        copy = df.copy()
        features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
        target = 'class_'
        # Preprocessing
        df = df.ml.minmax_scaler(features)
        df = df.ml.one_hot_encoder(features=[target])
        features = df.get_column_names(regex='^minmax')
        targets = df.get_column_names(regex='^class__')
        # Create the neural network model
        nn_model = K.Sequential()
        nn_model.add(K.layers.Dense(4, activation='tanh'))
        nn_model.add(K.layers.Dense(3, activation='softmax'))
        nn_model.compile(optimizer=K.optimizers.RMSprop(learning_rate=0.01),
                         loss=K.losses.categorical_crossentropy,
                         metrics=['accuracy'])
        X = df[features].values
        y = df[targets].values
        nn_model.fit(x=X, y=y, validation_split=0.05, epochs=11, verbose=0)
        keras_model = vaex.ml.tensorflow.KerasModel(features=features, prediction_name='pred', model=nn_model)
        df_trans = keras_model.transform(df)
        assert df_trans.pred.shape == (150, 3)
        state_path = str(tmpdir.join('state.json'))
>       df_trans.state_write(state_path)
tests/ml/tensorflow_test.py:47:
...
>       raise ValueError(
            "Invalid filepath extension for saving. "
            "Please add either a `.keras` extension for the native Keras "
            f"format (recommended) or a `.h5` extension. "
            "Use `model.export(filepath)` if you want to export a SavedModel "
            "for use with TFLite/TFServing/etc. "
            f"Received: filepath={filepath}."
        )
E       ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=/tmp/tmpcu2kp0mw.
```
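Side note on that last one (not vaex code, just the Keras 3 rule): paths passed to `model.save` must now end in `.keras` or `.h5`, so the fix is presumably to give the temporary file such a suffix. A minimal sketch with a hypothetical model:

```python
import tempfile
import keras

# hypothetical tiny model, only to demonstrate the saving rule
model = keras.Sequential([keras.layers.Input(shape=(4,)), keras.layers.Dense(1)])

# Keras 3 raises the ValueError above for extension-less paths;
# a `.keras` suffix selects the native format
with tempfile.NamedTemporaryFile(suffix='.keras', delete=False) as f:
    path = f.name
model.save(path)
```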
Awesome cleanup, great that you used uv!
Q: Why did we have to rename vaex-meta to meta?
```python
            'annoy',
            'scikit-learn',
            'xgboost',
            'lightgbm~=4.0',
            'catboost',
```
these should be optional, right? Put them under `all`?
which ones exactly should go to `all`?
all these, since they were not a dependency before (more like a peer dependency - in npm speak)
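For illustration, "put them under `all`" would amount to something like this in the package's `setup.py` (a sketch, not the actual file):

```python
# sketch: the trainers become optional extras instead of hard requirements,
# so a plain `pip install vaex-ml` stays lean and
# `pip install 'vaex-ml[all]'` pulls in the heavy libraries
extras_require = {
    'all': [
        'annoy',
        'scikit-learn',
        'xgboost',
        'lightgbm~=4.0',
        'catboost',
    ],
}
```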
```python
    ],
    packages=[],
    install_requires=[
        'pip',
        f"{package}[all] @ {(Path(__file__).parent / 'packages' / package).as_uri()}"
```
are you saying this works for an editable install? (awesome if yes!)
nope, this doesn't work for `-e` I'm afraid. However, I think you could do `pip install -e packages/*` if you wanted.
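In practice that glob needs a loop, since pip applies `-e` only to the single path that follows it; a minimal sketch in Python (the layout is assumed, not taken from the PR):

```python
# sketch: editable installs for all subpackages, one at a time
# (a bare `pip install -e packages/*` would apply -e only to the first match)
import subprocess
from pathlib import Path

for pkg in sorted(Path('packages').iterdir()):
    if (pkg / 'setup.py').exists():
        subprocess.check_call(['pip', 'install', '-e', str(pkg)])
```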
In solara we do:
https://github.com/widgetti/solara/blob/master/requirements-dev.txt
But if we do this, we maybe have to update https://vaex.io/docs/installing.html.
We can simply update the keys, that is expected.
happy to disable that test and leave it up to a community member to fix it. The rest of the errors we could solve by pinning.
@maartenbreddels see PR description, second bullet point, last sub-bullet point
that is my hope too, although Python 3.11 / 3.12 will need certain fresher libraries than were used before: e.g. graphql v3.
packages/vaex-core/setup.py:

```python
setup_requires = ["numpy~=1.17"]
install_requires_core = ["numpy~=1.17", "aplus", "tabulate>=0.8.3", "dask!=2022.4.0",
                         "future>=0.15.2", "pyyaml", "six", "cloudpickle", "pandas",
                         "nest-asyncio>=1.3.3", "pyarrow>=5.0.0", "frozendict!=2.2.0",
```
pandas Python 3.12 wheels were introduced in pandas v2.1.1, so that's the implicit minimum version CI needs to support. E.g. the error with `'M'` as a period unit: that behaviour was already changed in pandas v1.0, so I think vaex CI was set up with pandas v0.x.
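One way to encode that implicit minimum would be an environment marker (a sketch, not what this PR actually pins):

```python
# sketch for setup.py: cp312 needs pandas >= 2.1.1 (the first release
# shipping Python 3.12 wheels); older interpreters can keep a looser pin
install_requires = [
    'pandas>=2.1.1; python_version >= "3.12"',
    'pandas; python_version < "3.12"',
]
```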
hmm ^ new commit unrelated to build, now there are build errors that weren't there last week 🤯
How did I miss that? 🤯
this clang boost compile error was fixed by downgrading to …
@maartenbreddels ref #3 (comment) I'm going to need your support with:
We can do the fingerprint test like…
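The snippet itself did not survive here, but judging from the squash commit "allow for multiple fingerprints related to package versions", the idea is presumably something like this (hash values taken from the failure above, the version annotations are assumptions):

```python
# sketch: the fingerprint hash depends on installed package versions,
# so accept any of the known-good values
assert df_a.fingerprint() in {
    'dataframe-c13a4ab588272f03855ae5627731f7e5',  # value from the old CI setup
    'dataframe-d4565ca8187231a051a9ff888ba16e7c',  # value seen with new pandas
}
```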
The matrix issue can be fixed with (I cannot push to your branch):

```diff
diff --git a/packages/vaex-core/vaex/column.py b/packages/vaex-core/vaex/column.py
index 40de5aab..87938ce3 100644
--- a/packages/vaex-core/vaex/column.py
+++ b/packages/vaex-core/vaex/column.py
@@ -163,7 +163,10 @@ class ColumnSparse(Column):
     def __getitem__(self, slice):
         # not sure if this is the fastest
-        return self.matrix[slice, self.column_index].A[:,0]
+        mat = self.matrix[slice, self.column_index]
+        dense = mat.todense()
+        column_part = np.asarray(dense)[:,0]
+        return column_part
 
 class ColumnNumpyLike(Column):
```

We can drop support for graphql for older Python versions, so we skip that test?
the test does …
regarding the graphql v3 requirement, do you suggest to leave the test broken? Maybe it's just a simple syntax update to the query (just need to figure out which).
I'll merge this into the main PR so you can take over again, if that's OK 🙏
* fix[ml]: adjust tests to reflect latest apis of 3rd party libraries (xgboost, lightgbm)
* Build wheels on pull_request
* Maximize wheel build parallelization
* git submodule update --remote --merge packages/vaex-core/vendor/pybind11
* Fix gcc error
* Fix workflow syntax
* Remove redefinition https://github.com/pybind/pybind11/blob/769fd3b889fef6cddb060f2a0be26aee62b4da05/include/pybind11/pytypes.h#L859 https://github.com/ddelange/vaex/actions/runs/3965609112/jobs/6795506653#step:6:2110
* Disable win32 https://github.com/ddelange/vaex/actions/runs/3965689146/jobs/6795667118#step:6:538
* Remove testing leftovers
* Add upper cap on lightgbm microsoft/LightGBM#5196 (comment)
* Migrate to mamba-org/setup-micromamba@v1
* Upload release assets on Github (pre)release
* Add missing permission for release assets
* Build cp312 wheels
* Remove setuptools and wheel from pyproject.toml
* Replace imp with importlib
* chore: trigger ci
* ci: upgrade xcode for brew install libomp
* update requirements-ml to comply with the latest vaex-ml expectations
* try to install lightgbm via pip
* fix: only the mini taxi file is on s3 for cost savings
* ci: pin dask<2024.2.0 to get the same hash keys. This version gives different results, although not a problem in production (it will make your cache invalid though); for CI we test that we have stable keys (fingerprints)
* fix: only the mini taxi file is on s3 for cost savings (2)
* ci: skip notebooks that depend on the dataframe server
* ci: skip ci steps in various old python versions
* Bump micromamba and other actions
* ci: specific test hangs on ci+osx
* Bump cibuildwheel, use native arm64 mac runners
* test: log is renamed to log loss
* test: skip lightgbm tests on 36 and 37
* test: skip sklearn tests on 36 and 37
* Update packages/vaex-core/setup.py (Co-authored-by: ddelange <14880945+ddelange@users.noreply.github.com>)
* chore: drop python 3.6 and 3.7 support (Co-authored-by: Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>)
* test: skip a failing test for windows
* ci: macOS 11 is retired as of June 28
* ci: always build wheels, but do not publish instead
* ci: try with older micromamba, 1.5.6 seems to sometimes hang
* Install setuptools in wheel.yml
* Add sudo
* Fix windows
* Use sudo only on macos
* Fix empty string evaluating false
* Add .readthedocs.yaml
* Pull submodules
* Try editable rtd install
* Try move editable install to requirements_rtd.txt
* Allow newer sphinx
* Autocancel previous runs in PRs
* Autocancel all jobs
* Sphinx sidebar fix https://github.com/dmwyatt/sphinx-book-theme/blob/2416ef6cebc3d83019dbc83f25c61b72719cad55/docs/customize/sidebar-primary.md#default-sidebar-elements
* Remove autocancel, doesn't work from forks
* Add cancel-in-progress keys
* Amend to only cancel on PRs
* Add CIBW_TEST_COMMAND
* Disallow numpy v2
* Skip musllinux due to missing blake3 wheels
* Install cargo for blake3 source install
* Fix CIBW_ENVIRONMENT
* Add CIBW_TEST_SKIP
* Build wheels with numpy v1
* run tests on 3.10-3.12
* upgrade micromamba
* unpin pytest-asyncio
* try different pin as <2.0 because mamba crashes on that
* Fix dephell and CI hell (ddelange#3)
* allow for multiple fingerprints related to package versions
* explicit dtype=object
* fix pandas issues
* revert graphene upgrade
* force uv to ignore Werkzeug (we do not use it, it's a subdependency which has conflicts)
* fix: work with modern scipy/sparse arrays
* skip test for python 3.8
* skip test on windows
* add missing import
* skip some tests conditionally
* tuple comparison fix
* does osx run without sourcing .bash profile?
* skip a few more tests
* only run test on linux
* Try building arm64 wheels on macos-13
* Switch to older Xcode
* Fix readthedocs by manually unpinning werkzeug in the graphene-tornado v2.6.1 release
  # for now we leave graphene-tornado / graphene on v2
  # ref #2356 (comment)
  # in top level vaex[ci] we added the following (additional) direct reference requirement which adds one commit (ddelange/graphene-tornado@d75f01f) on top of the 2.6.1 release to unpin werkzeug
  # "graphene-tornado @ https://github.com/ddelange/graphene-tornado/archive/refs/heads/2.6.1.unpin-werkzeug.zip",
* Simplify
* Revert "Simplify" (This reverts commit 91263bc.)
* Add back tags event trigger for core wheel upload
* Switch to trusted publishing
* Add trusted publishing for vaex-core
* Exclude windows 3.8 from pythonpackage.yml
* Don't upload vaex-core source distribution
* Disable cp38-win vaex-core wheel build
* Add permission

---------

Co-authored-by: Jovan Veljanoski <jovan.veljanoski@tiqets.com>
Co-authored-by: Maarten A. Breddels <maartenbreddels@gmail.com>
Co-authored-by: Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
going into vaexio#2331

fix some dephell and related CI hell by getting rid of conda and using `install_requires` and `extras_require` exclusively:

- the top-level `setup.py` `name` was/is vaex-meta; the distribution `name` was/is vaex (a self-referencing `vaex @ file://.../vaex-meta` is not allowed)
- dependencies move out of the top-level `setup.py`, either to `install_requires` or to the `[all]` extra
- `setup-python` in CI
- `uv pip install .[ci]` in CI
- `pip install .` now also works
- `pip install https://github.com/vaexio/vaex/archive/refs/heads/master.zip` now also works
- relative paths in `setup.py` are replaced with `file://` specifiers
- remove the `misc/conda` folder containing outdated dependency derivatives (potential source of confusion)