Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vision #2184

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open

Vision #2184

Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
@@ -332,6 +332,16 @@ def is_datetime(self, expression):
def is_string(self, expression):
    """Return whether the data type of ``expression`` is a string type.

    The decision is delegated to ``vaex.array_types.is_string_type``.
    """
    dtype = self.data_type(expression)
    return vaex.array_types.is_string_type(dtype)

def is_image(self, expression):
    """Return True if ``expression`` appears to hold image objects.

    A column counts as an image column when its data type is ``object`` and
    the first row left after dropping missing values exposes ``_repr_png_``.

    :param expression: column name or expression to inspect
    :return: bool
    :raises RuntimeError: if pillow cannot be imported
    """
    try:
        import PIL  # noqa: F401 -- imported only to verify pillow is available
    except ModuleNotFoundError as exc:
        raise RuntimeError("Please install pillow for image support") from exc
    if self.data_type(expression) != object:
        return False
    values = self.dropna(column_names=[expression]).head(1)[expression].values
    if len(values) == 0:
        # No non-missing row to sample, so nothing marks this as an image column.
        return False
    return hasattr(values[0], '_repr_png_')

def is_category(self, column):
"""Returns true if column is a category."""
column = _ensure_string_from_expression(column)
@@ -3988,7 +3998,7 @@ def table_part(k1, k2, parts):
if columns_sliced is not None and j >= columns_sliced:
column_index += 1 # skip over the slice/ellipsis
value = values[name][i]
value = _format_value(value)
value = _format_value(value, value_format=format)
values_list[column_index+1][1].append(value)
# parts += ["</tr>"]
# return values_list
@@ -4011,7 +4021,10 @@ def table_part(k1, k2, parts):
values_list = dict(values_list)
# print(values_list)
import tabulate
table_text = str(tabulate.tabulate(values_list, headers="keys", tablefmt=format))
tablefmt = format
if tablefmt == "html":
tablefmt = "unsafehtml"
table_text = str(tabulate.tabulate(values_list, headers="keys", tablefmt=tablefmt))
# Tabulate 0.8.7+ escapes html :()
table_text = table_text.replace('&lt;i style=&#x27;opacity: 0.6&#x27;&gt;', "<i style='opacity: 0.6'>")
table_text = table_text.replace('&lt;/i&gt;', "</i>")
@@ -4052,7 +4065,7 @@ def table_part(k1, k2, parts):
parts += ["<td><i style='opacity: 0.6'>{:,}</i></td>".format(i + k1)]
for name in column_names:
value = data_parts[name][i]
value = _format_value(value)
value = _format_value(value, value_format=format)
parts += ["<td>%r</td>" % value]
parts += ["</tr>"]
return parts
@@ -4084,7 +4097,7 @@ def _output_css(self):
def _repr_mimebundle_(self, include=None, exclude=None, **kwargs):
# TODO: optimize, since we use the same data in both versions
# TODO: include latex version
return {'text/html':self._head_and_tail_table(format='html'), 'text/plain': self._head_and_tail_table(format='plain')}
return {'html': self._head_and_tail_table(format='html'), 'text/plain': self._head_and_tail_table(format='plain')}

def _repr_html_(self):
"""Representation for Jupyter."""
170 changes: 129 additions & 41 deletions packages/vaex-core/vaex/expression.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions packages/vaex-core/vaex/expresso.py
Original file line number Diff line number Diff line change
@@ -127,6 +127,9 @@ def validate_expression(expr, variable_set, function_set=[], names=None):
validate_expression(expr.value, variable_set, function_set, names)
elif isinstance(expr, ast_Constant):
pass # like True and False
elif isinstance(expr, _ast.Tuple):
for el in expr.elts:
validate_expression(el, variable_set, function_set, names)
elif isinstance(expr, _ast.List):
for el in expr.elts:
validate_expression(el, variable_set, function_set, names)
@@ -381,6 +384,9 @@ def visit_Str(self, node):
def visit_List(self, node):
    """Render a list node as ``[a, b, ...]`` from its visited elements."""
    rendered = [self.visit(element) for element in node.elts]
    return "[" + ", ".join(rendered) + "]"

def visit_Tuple(self, node):
    """Render a tuple node with a trailing comma after every element.

    Two elements give ``(a, b,)``, one gives ``(a,)`` and none gives ``()``.
    """
    pieces = []
    for element in node.elts:
        pieces.append(self.visit(element) + ",")
    return "(" + " ".join(pieces) + ")"

def pow(self, left, right):
    """Return the source text ``(left ** right)`` for a power expression."""
    return f"({left} ** {right})"

29 changes: 24 additions & 5 deletions packages/vaex-core/vaex/formatting.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from base64 import b64encode

import numpy as np
import numbers
import six
@@ -6,14 +8,30 @@
from vaex import datatype, struct

MAX_LENGTH = 50
IMAGE_WIDTH = 100
IMAGE_HEIGHT = 100



def _trim_string(value):
    """Shorten ``value`` for display when it is longer than ``MAX_LENGTH``.

    Values of at most ``MAX_LENGTH`` characters are returned unchanged.
    Longer values are cut to their first ``MAX_LENGTH - 3`` characters,
    passed through ``repr`` and suffixed with ``'...'``.
    """
    if len(value) > MAX_LENGTH:
        # Trim exactly once: [:-1] drops repr()'s closing quote so the
        # ellipsis reads as a continuation of the text.
        value = repr(value[:MAX_LENGTH - 3])[:-1] + '...'
    return value

def _format_value(value):

def _format_value(value, value_format='plain'):
if value_format == "html":
if hasattr(value, '_repr_png_'):
data = value._repr_png_()
base64_data = b64encode(data)
data_encoded = base64_data.decode('ascii')
url_data = f"data:image/png;base64,{data_encoded}"
plain = f'<img src="{url_data}" width="{IMAGE_WIDTH}" height="{IMAGE_HEIGHT}"></img>'
return plain
elif hasattr(value, 'shape') and len(value.shape) > 1:
return _trim_string(str(value).replace('\n', '<br>'))


# print("value = ", value, type(value), isinstance(value, numbers.Number))
if isinstance(value, pa.lib.Scalar):
if datatype.DataType(value.type).is_struct:
@@ -44,16 +62,17 @@ def _format_value(value):
tmp = datetime.timedelta(seconds=value / np.timedelta64(1, 's'))
ms = tmp.microseconds
s = np.mod(tmp.seconds, 60)
m = np.mod(tmp.seconds//60, 60)
m = np.mod(tmp.seconds // 60, 60)
h = tmp.seconds // 3600
d = tmp.days
if ms:
value = str('%i days %+02i:%02i:%02i.%i' % (d,h,m,s,ms))
value = str('%i days %+02i:%02i:%02i.%i' % (d, h, m, s, ms))
else:
value = str('%i days %+02i:%02i:%02i' % (d,h,m,s))
value = str('%i days %+02i:%02i:%02i' % (d, h, m, s))
return value
elif isinstance(value, numbers.Number):
value = str(value)

else:
value = repr(value)
value = _trim_string(value)
3 changes: 2 additions & 1 deletion packages/vaex-core/vaex/registry.py
Original file line number Diff line number Diff line change
@@ -11,7 +11,8 @@
'str_pandas': vaex.expression.StringOperationsPandas,
'dt': vaex.expression.DateTime,
'td': vaex.expression.TimeDelta,
'struct': vaex.expression.StructOperations
'struct': vaex.expression.StructOperations,
'vision': vaex.expression.Image
}


183 changes: 183 additions & 0 deletions packages/vaex-core/vaex/vision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
__author__ = 'yonatanalexander'

import glob
import os
import pathlib
import functools
import numpy as np
import warnings
import io
import vaex
import vaex.utils

# base64 is standard library, so import it unconditionally; it is needed by
# the byte/str converters below whether or not pillow is installed.
import base64

try:
    # Import the submodule explicitly: the helpers below use PIL.Image, and a
    # bare ``import PIL`` does not guarantee that attribute is loaded.
    import PIL.Image
except ImportError:
    # NOTE(review): presumably this defers the missing-pillow error until the
    # module is first used -- confirm against vaex.utils.optional_import.
    PIL = vaex.utils.optional_import("PIL.Image", modules="pillow")


def get_paths(path, suffix=None):
    """Resolve ``path`` to a list of file paths.

    ``path`` may be:

    * a list, in which case every entry is resolved and the results are
      concatenated in order (an empty list gives an empty list);
    * an existing file, returned as a one-element list;
    * an existing directory, searched recursively for names ending in
      ``suffix``, or in one of ``jpg``, ``png``, ``jpeg``, ``ppm``,
      ``thumbnail`` when ``suffix`` is None;
    * a glob pattern string that matches at least one path.

    :param path: file, directory, glob pattern, or list of those
    :param suffix: optional file-name ending used for directory searches
    :return: list of paths
    :raises ValueError: if ``path`` matches none of the cases above
    """
    if isinstance(path, list):
        # Resolve each entry on its own. Folding with functools.reduce handed
        # back the raw element for a one-item list and failed on an empty one.
        files = []
        for entry in path:
            files.extend(get_paths(entry, suffix=suffix))
        return files
    if os.path.isfile(path):
        files = [path]
    elif os.path.isdir(path):
        if suffix is not None:
            endings = [suffix]
        else:
            endings = ['jpg', 'png', 'jpeg', 'ppm', 'thumbnail']
        root = pathlib.Path(path)
        files = [str(found) for ending in endings for found in root.rglob(f"*{ending}")]
    else:
        # Expand the pattern once and reuse the result.
        matches = glob.glob(path) if isinstance(path, str) else []
        if not matches:
            raise ValueError(
                f"path: {path} do not point to an image, a directory of images, or a nested directory of images, or a glob path of files")
        return matches
    # TODO validate the files without opening it
    return files


def _safe_apply(f, image_array):
try:
return f(image_array)
except Exception as e:
return None


def _infer(item):
    """Decode a single value into a PIL image, choosing the decoder by type.

    Values exposing ``as_py`` are first converted with it. NumPy arrays go
    through ``numpy_2_pil``, bytes through ``bytes_2_pil``, and strings are
    opened as a file when they name an existing file, otherwise decoded with
    ``str_2_pil``. The decoder runs under ``_safe_apply``, so a decoding
    failure yields None.

    :raises RuntimeError: if the value's type is not handled
    """
    if hasattr(item, 'as_py'):
        item = item.as_py()

    if isinstance(item, np.ndarray):
        return _safe_apply(numpy_2_pil, item)
    if isinstance(item, int):
        # NOTE(review): np.ndarray(item) allocates an uninitialized array of
        # shape (item,) -- confirm this is the intended handling of integers.
        return _safe_apply(numpy_2_pil, np.ndarray(item))
    if isinstance(item, bytes):
        return _safe_apply(bytes_2_pil, item)
    if isinstance(item, str):
        decode = PIL.Image.open if os.path.isfile(item) else str_2_pil
        return _safe_apply(decode, item)
    raise RuntimeError(f"Can't handle item {item}")


@vaex.register_function(scope='vision')
def infer(images):
    """Decode every item of ``images`` with ``_infer`` into an object array."""
    decoded = list(map(_infer, images))
    return np.array(decoded, dtype="O")


@vaex.register_function(scope='vision')
def open(path, suffix=None):
    """Build a DataFrame of images found at ``path``.

    The files are located with ``get_paths`` and stored in a ``path`` column;
    an ``image`` column is added by applying ``vision.infer`` to it.

    :param path: passed to ``get_paths``
    :param suffix: passed to ``get_paths``
    :return: DataFrame with ``path`` and ``image`` columns
    """
    located = get_paths(path=path, suffix=suffix)
    df = vaex.from_arrays(path=located)
    df['image'] = df['path'].vision.infer()
    return df


@vaex.register_function(scope='vision')
def filename(images):
    """Return each item's ``filename`` attribute, or None where it has none."""
    names = [getattr(image, 'filename', None) for image in images]
    return np.array(names, dtype="O")


@vaex.register_function(scope='vision')
def resize(images, size, resample=3, **kwargs):
    """Resize every image in ``images`` by calling its ``resize`` method.

    None entries are passed through unchanged instead of raising, since the
    ``from_*`` decoders in this module produce None when decoding fails.

    :param images: iterable of images (or None)
    :param size: forwarded to ``image.resize``
    :param resample: forwarded as the ``resample`` keyword; the default 3 is
        presumably Pillow's bicubic filter -- confirm against the Pillow docs
    :param kwargs: extra keyword arguments forwarded to ``image.resize``
    :return: object array with one entry per input
    """
    resized = [
        None if image is None else image.resize(size, resample=resample, **kwargs)
        for image in images
    ]
    return np.array(resized, dtype="O")


@vaex.register_function(scope='vision')
def to_numpy(images):
    """Convert every image with ``pil_2_numpy`` and collect an object array."""
    converted = list(map(pil_2_numpy, images))
    return np.array(converted, dtype="O")


@vaex.register_function(scope='vision')
def to_bytes(arrays, format='png'):
    """Serialize every image in ``arrays`` with ``pil_2_bytes``.

    None entries are passed through unchanged instead of raising, since the
    ``from_*`` decoders in this module produce None when decoding fails.

    :param arrays: iterable of images (or None)
    :param format: image format name forwarded to ``pil_2_bytes``
    :return: object array with one entry per input
    """
    encoded = [
        None if image is None else pil_2_bytes(image, format=format)
        for image in arrays
    ]
    return np.array(encoded, dtype="O")


@vaex.register_function(scope='vision')
def to_str(arrays, format='png', encoding=None):
    """Serialize every image in ``arrays`` to a string with ``pil_2_str``.

    None entries are passed through unchanged instead of raising, since the
    ``from_*`` decoders in this module produce None when decoding fails.

    :param arrays: iterable of images (or None)
    :param format: image format name forwarded to ``pil_2_str``
    :param encoding: text encoding forwarded to ``pil_2_str``
    :return: object array with one entry per input
    """
    encoded = [
        None if image is None else pil_2_str(image, format=format, encoding=encoding)
        for image in arrays
    ]
    return np.array(encoded, dtype="O")


@vaex.register_function(scope='vision')
def from_numpy(arrays):
    """Build an image from every array via ``numpy_2_pil``; failures become None."""
    decoded = []
    for image_array in arrays:
        decoded.append(_safe_apply(numpy_2_pil, image_array))
    return np.array(decoded, dtype="O")


@vaex.register_function(scope='vision')
def from_bytes(arrays):
    """Decode every bytes value via ``bytes_2_pil``; failures become None."""
    decoded = []
    for payload in arrays:
        decoded.append(_safe_apply(bytes_2_pil, payload))
    return np.array(decoded, dtype="O")


@vaex.register_function(scope='vision')
def from_str(arrays):
    """Decode every string value via ``str_2_pil``; failures become None."""
    decoded = []
    for text in arrays:
        decoded.append(_safe_apply(str_2_pil, text))
    return np.array(decoded, dtype="O")


@vaex.register_function(scope='vision')
def from_path(arrays):
    """Open every path with ``PIL.Image.open``; failures become None.

    The input is first converted with ``vaex.array_types.tolist``.
    """
    paths = vaex.array_types.tolist(arrays)
    opened = []
    for location in paths:
        opened.append(_safe_apply(PIL.Image.open, location))
    return np.array(opened, dtype="O")


def rgba_2_pil(rgba):
    """Build an ``RGBA`` PIL image from ``rgba`` with its first axis reversed.

    Warnings raised during the conversion are suppressed.
    """
    # TODO remove?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        flipped = rgba[::-1]
        image = PIL.Image.fromarray(flipped, "RGBA")  # , "RGBA", 0, -1)
    return image


def numpy_2_pil(array):
    """Convert ``array`` with ``np.uint8`` and wrap the result in a PIL image."""
    pixels = np.uint8(array)
    return PIL.Image.fromarray(pixels)


def pil_2_numpy(im):
    """Return ``im`` as an object-dtype NumPy array, or None when ``im`` is None."""
    if im is None:
        return None
    pixels = np.array(im)
    return pixels.astype(object)


def pil_2_bytes(im, format="png"):
    """Save ``im`` in ``format`` to memory and return the base64-encoded bytes.

    Note that the result is base64 text as ``bytes``, not the raw image data.
    """
    buffer = io.BytesIO()
    im.save(buffer, format)
    raw = buffer.getvalue()
    return base64.b64encode(raw)


def bytes_2_pil(b):
    """Base64-decode ``b`` and open the resulting bytes as a PIL image."""
    raw = base64.b64decode(b)
    stream = io.BytesIO(raw)
    return PIL.Image.open(stream)


def pil_2_str(im, format="png", encoding=None):
    """Return the ``pil_2_bytes`` output for ``im`` decoded to ``str``.

    ``encoding`` is passed to ``bytes.decode`` when truthy; otherwise the
    default of ``bytes.decode`` is used.
    """
    encoded = pil_2_bytes(im, format=format)
    if encoding:
        return encoded.decode(encoding)
    return encoded.decode()


def str_2_pil(im, encoding=None):
    """Encode the string ``im`` to bytes and decode it with ``bytes_2_pil``.

    ``encoding`` is passed to ``str.encode`` when truthy; otherwise the
    default of ``str.encode`` is used.
    """
    if encoding:
        payload = im.encode(encoding)
    else:
        payload = im.encode()
    return bytes_2_pil(payload)


def rgba_to_url(rgba):
    """Return a ``data:image/png;base64,...`` URL for an RGBA array.

    Arrays whose dtype is not ``uint8`` are multiplied by 255 and cast to
    ``uint8`` before the image is built with ``rgba_2_pil``.

    :param rgba: array accepted by ``rgba_2_pil``
    :return: data URL string
    """
    if rgba.dtype != np.uint8:
        rgba = (rgba * 255.).astype(np.uint8)
    im = rgba_2_pil(rgba)
    # pil_2_bytes already returns base64-encoded PNG data; encoding it a
    # second time would produce a URL that does not decode to an image.
    data = pil_2_bytes(im).decode("ascii")
    return "data:image/png;base64," + data
Binary file added tests/data/images/cats/cat.4865.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/images/cats/cat.9021.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/images/dogs/dog.2423.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/images/dogs/dog.8091.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
39 changes: 39 additions & 0 deletions tests/ml/vision_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import vaex.vision
import PIL

basedir = 'tests/data/images'


def test_vision_conversions():
    """Images survive bytes/str/numpy round trips, explicitly and via infer."""
    def first_is_image(expression):
        return isinstance(expression.values[0], PIL.Image.Image)

    df = vaex.vision.open(basedir)
    df['image_bytes'] = df['image'].vision.to_bytes()
    df['image_str'] = df['image'].vision.to_str()
    df['image_array'] = df['image'].vision.resize((10, 10)).vision.to_numpy()

    # Explicit decoders.
    assert first_is_image(df['image_bytes'].vision.from_bytes())
    assert first_is_image(df['image_str'].vision.from_str())
    assert first_is_image(df['image_array'].vision.from_numpy())

    # Type-based inference over the same columns and over the paths.
    assert first_is_image(df['image_bytes'].vision.infer())
    assert first_is_image(df['image_str'].vision.infer())
    assert first_is_image(df['image_array'].vision.infer())
    assert first_is_image(df['path'].vision.infer())


def test_vision_open():
    """``open`` accepts a directory, a sub-directory, a glob, a file and a list."""
    df = vaex.vision.open(basedir)
    assert df.shape == (4, 2)

    dogs = basedir + '/dogs'
    one_dog = dogs + '/dog.2423.jpg'
    one_cat = basedir + '/cats/cat.4865.jpg'
    assert vaex.vision.open(dogs).shape == (2, 2)
    assert vaex.vision.open(dogs + '/dog*').shape == (2, 2)
    assert vaex.vision.open(one_dog).shape == (1, 2)
    assert vaex.vision.open([one_dog, one_cat]).shape == (2, 2)

    assert 'path' in df
    assert 'image' in df


def test_vision():
    """The image column holds PIL images and converts to arrays of known shape."""
    df = vaex.vision.open(basedir)
    assert df.shape == (4, 2)

    first_image = df.image.tolist()[0]
    assert isinstance(first_image, PIL.Image.Image)

    full_size = df.image.vision.to_numpy()
    assert full_size.shape == (4, 261, 350, 3)

    # resize takes (width, height); the array shape is (rows, height, width, channels).
    shrunk = df.image.vision.resize((8, 4)).vision.to_numpy()
    assert shrunk.shape == (4, 4, 8, 3)