Description
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
(optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample
import pandas as pd
import numpy as np
pd.DataFrame(range(10**1), dtype=np.int64)=='' # OK, just a FutureWarning
pd.DataFrame(range(10**2), dtype=np.int64)=='' # OK, just a FutureWarning
pd.DataFrame(range(10**3), dtype=np.int64)=='' # OK, just a FutureWarning
pd.DataFrame(range(10**4), dtype=np.int64)=='' # OK, just a FutureWarning
# pd.DataFrame(range(10**5), dtype=np.int64)=='' # ValueError: unknown type str32
# pd.DataFrame(range(10**5), dtype=np.int64)=='  ' # ValueError: unknown type str64
pd.DataFrame(range(10**5), dtype=np.int64)=='    ' # ValueError: unknown type str128
/home/ubuntu/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py:68: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
return op(a, b)
/home/ubuntu/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py:68: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
return op(a, b)
/home/ubuntu/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py:68: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
return op(a, b)
/home/ubuntu/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py:68: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
return op(a, b)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-104-1641071c93d2> in <module>
5 # pd.DataFrame(range(10**5), dtype=np.int64)=='' # ValueError: unknown type str32
6 # pd.DataFrame(range(10**5), dtype=np.int64)=='  ' # ValueError: unknown type str64
----> 7 pd.DataFrame(range(10**5), dtype=np.int64)=='    ' # ValueError: unknown type str128
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/ops/__init__.py in f(self, other)
702
703 # See GH#4537 for discussion of scalar op behavior
--> 704 new_data = dispatch_to_series(self, other, op, axis=axis)
705 return self._construct_result(new_data)
706
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/ops/__init__.py in dispatch_to_series(left, right, func, axis)
263 if not is_list_like(right):
264 # i.e. scalar, faster than checking np.ndim(right) == 0
--> 265 bm = left._mgr.apply(array_op, right=right)
266 return type(left)(bm)
267
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, **kwargs)
402
403 if callable(f):
--> 404 applied = b.apply(f, **kwargs)
405 else:
406 applied = getattr(b, f)(**kwargs)
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/internals/blocks.py in apply(self, func, **kwargs)
343 """
344 with np.errstate(all="ignore"):
--> 345 result = func(self.values, **kwargs)
346
347 return self._split_op_result(result)
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in comparison_op(left, right, op)
244 warnings.simplefilter("ignore", DeprecationWarning)
245 with np.errstate(all="ignore"):
--> 246 res_values = na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
247
248 return res_values
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/ops/array_ops.py in na_arithmetic_op(left, right, op, is_cmp)
140
141 try:
--> 142 result = expressions.evaluate(op, left, right)
143 except TypeError:
144 if is_cmp:
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py in evaluate(op, a, b, use_numexpr)
228 use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)
229 if use_numexpr:
--> 230 return _evaluate(op, op_str, a, b) # type: ignore
231 return _evaluate_standard(op, op_str, a, b)
232
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/pandas/core/computation/expressions.py in _evaluate_numexpr(op, op_str, a, b)
110 f"a_value {op_str} b_value",
111 local_dict={"a_value": a_value, "b_value": b_value},
--> 112 casting="safe",
113 )
114
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/numexpr/necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
820 # Create a signature
821 signature = [(name, getType(arg)) for (name, arg) in
--> 822 zip(names, arguments)]
823
824 # Look up numexpr if possible.
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/numexpr/necompiler.py in <listcomp>(.0)
819
820 # Create a signature
--> 821 signature = [(name, getType(arg)) for (name, arg) in
822 zip(names, arguments)]
823
~/miniconda3/envs/genomics.py3/lib/python3.7/site-packages/numexpr/necompiler.py in getType(a)
701 if kind == 'S':
702 return bytes
--> 703 raise ValueError("unknown type %s" % a.dtype.name)
704
705
ValueError: unknown type str128
Problem description
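Comparing an integer DataFrame with a string raises an error with an unclear message, and only under seemingly unpredictable conditions: it depends on the DataFrame size (up to 10**4 rows only a FutureWarning is emitted, while 10**5 rows raises a ValueError). An informative error message should be raised instead. The current message itself seems buggy: it somehow converts the size of the string into a supposed data type (str32, str64, str128).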
Expected Output
Output of pd.show_versions()
INSTALLED VERSIONS
commit : 2a7d332
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.4.0-1028-aws
Version : #29~18.04.1-Ubuntu SMP Tue Oct 6 17:14:23 UTC 2020
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : C.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.1.2
numpy : 1.19.1
pytz : 2019.1
dateutil : 2.8.0
pip : 19.1.1
setuptools : 41.0.1
Cython : None
pytest : None
hypothesis : None
sphinx : 2.1.2
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.3.4
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.1
IPython : 7.8.0
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : 0.8.4
fastparquet : 0.4.1
gcsfs : None
matplotlib : 3.1.0
numexpr : 2.6.9
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.5.2
sqlalchemy : 1.3.9
tables : 3.5.2
tabulate : None
xarray : 0.14.1
xlrd : 1.2.0
xlwt : None
numba : 0.48.0