Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] IO - Change origin attribute when not found on system #6555

Merged
merged 7 commits into from
Oct 6, 2023
14 changes: 5 additions & 9 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import xlsxwriter
import openpyxl

from Orange.data import _io, Table, Domain, ContinuousVariable
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
from Orange.data import Compression, open_compressed, detect_encoding, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
Expand Down Expand Up @@ -164,21 +164,15 @@ def read(self):
skipinitialspace=True,
)
data = self.data_table(reader)

# TODO: Name can be set unconditionally when/if
# self.filename will always be a string with the file name.
# Currently, some tests pass StringIO instead of
# the file name to a reader.
if isinstance(self.filename, str):
data.name = path.splitext(
path.split(self.filename)[-1])[0]
data.name = path.splitext(path.split(self.filename)[-1])[0]
if error and isinstance(error, UnicodeDecodeError):
pos, endpos = error.args[2], error.args[3]
warning = ('Skipped invalid byte(s) in position '
'{}{}').format(pos,
('-' + str(endpos)) if (endpos - pos) > 1 else '')
warnings.warn(warning)
self.set_table_metadata(self.filename, data)
update_origin(data, self.filename)
return data
except Exception as e:
error = e
Expand Down Expand Up @@ -215,6 +209,7 @@ def read(self):
if not isinstance(table, Table):
raise TypeError("file does not contain a data table")
else:
update_origin(table, self.filename)
return table

@classmethod
Expand Down Expand Up @@ -264,6 +259,7 @@ def read(self):
try:
cells = self.get_cells()
table = self.data_table(cells)
update_origin(table, self.filename)
table.name = path.splitext(path.split(self.filename)[-1])[0]
if self.sheet and len(self.sheets) > 1:
table.name = '-'.join((table.name, self.sheet))
Expand Down
82 changes: 79 additions & 3 deletions Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
import os.path
import subprocess
from collections import defaultdict
from typing import Tuple, Optional

import numpy as np
import pandas as pd
from chardet.universaldetector import UniversalDetector

from Orange.data import (
is_discrete_values, MISSING_VALUES, Variable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, Table,
)
from Orange.misc.collections import natural_sorted

__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
"guess_data_type", "sanitize_variable"]
__all__ = [
"Compression",
"open_compressed",
"detect_encoding",
"isnastr",
"guess_data_type",
"sanitize_variable",
"update_origin",
]


class Compression:
Expand Down Expand Up @@ -207,3 +217,69 @@ def mapvalues(arr):
values = [_var.parse(i) for i in orig_values]

return values, var


def _extract_new_origin(
    attr: Variable, table: Table, lookup_dirs: Tuple[str, ...]
) -> Optional[str]:
    """
    Find a usable replacement for a column's "origin" attribute.

    Candidates are tried in order:
    1. The recorded origin itself, if it exists on this system.
    2. A lookup directory containing a subdirectory named like the last
       path component of the recorded origin.
    3. A lookup directory that directly contains every non-missing path
       stored in the column.

    Parameters
    ----------
    attr
        A string or discrete meta variable that has an "origin" attribute.
    table
        Table whose column values are checked against candidate directories.
    lookup_dirs
        Candidate directories, in priority order.

    Returns
    -------
    The new origin directory, or None when no candidate matches.
    """
    origin = attr.attributes["origin"]
    # origin exists on this system - keep it unchanged
    if os.path.exists(origin):
        return origin

    # last dir of origin found inside one of the lookup dirs
    dir_ = os.path.basename(os.path.normpath(origin))
    for ld in lookup_dirs:
        new_dir = os.path.join(ld, dir_)
        if os.path.isdir(new_dir):
            return new_dir

    # all column paths exist inside one of the lookup dirs; skip missing
    # values (empty strings for string variables, NaN for discrete ones).
    # Note: 0 is a valid discrete value index, so test for NaN/emptiness
    # explicitly rather than relying on truthiness of the raw value.
    paths = [
        attr.str_val(v)
        for v in table.get_column(attr)
        if not pd.isna(v) and v != ""
    ]
    if paths:  # an all-missing column gives no evidence - do not match
        for ld in lookup_dirs:
            if all(os.path.exists(os.path.join(ld, p)) for p in paths):
                return ld

    return None


def update_origin(table: Table, file_path: str):
    """
    Repair the "origin" attribute of path-holding meta columns in place.

    When a dataset whose columns store file paths is opened on another
    computer, the absolute "origin" recorded in the variable's attributes
    may no longer exist. For every string/discrete meta column with an
    "origin" attribute, this function searches for a directory that can
    serve as the new origin and, when found, updates the attribute.

    Directories searched (lookup directories):
    1. the directory containing ``file_path``;
    2. its parent directory (covers the common case of a workflow/dataset
       file placed next to a directory of e.g. images).

    A lookup directory is accepted when either the last component of the
    recorded origin exists inside it, or every path listed in the column
    can be found inside it (useful for multi-level relative paths such as
    ``a/b/c/file.txt``).

    Note: the table is modified in place.

    Parameters
    ----------
    table
        Orange Table whose meta columns are inspected and possibly updated.
    file_path
        Path of the loaded dataset; only its directory and that
        directory's parent are considered as new origins.
    """
    dataset_dir = os.path.dirname(file_path)
    # dict.fromkeys keeps insertion order while dropping the duplicate
    # entry that appears when dataset_dir is already the filesystem root
    # (a root directory is its own parent)
    lookup_dirs = tuple(
        dict.fromkeys((dataset_dir, os.path.dirname(dataset_dir)))
    )
    for attr in table.domain.metas:
        if not (attr.is_string or attr.is_discrete):
            continue
        if "origin" not in attr.attributes:
            continue
        new_origin = _extract_new_origin(attr, table, lookup_dirs)
        if new_origin:
            attr.attributes["origin"] = new_origin
124 changes: 123 additions & 1 deletion Orange/data/tests/test_io_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
import os.path
import unittest
from tempfile import TemporaryDirectory

from Orange.data import ContinuousVariable, guess_data_type
import numpy as np

from Orange.data import (
ContinuousVariable,
guess_data_type,
Table,
Domain,
StringVariable,
DiscreteVariable,
)
from Orange.data.io_util import update_origin


class TestIoUtil(unittest.TestCase):
Expand All @@ -10,5 +22,115 @@ def test_guess_continuous_w_nans(self):
ContinuousVariable)


class TestUpdateOrigin(unittest.TestCase):
    """Tests for io_util.update_origin: repairing stale "origin" attributes."""

    # file names stored in the path columns of the test tables
    FILE_NAMES = ["file1.txt", "file2.txt", "file3.txt"]

    def setUp(self) -> None:
        # directory that plays the role of the dataset's new location
        self.alt_dir = TemporaryDirectory()  # pylint: disable=consider-using-with

        # one table with a string path column, one with a discrete path
        # column; each gets an extra missing value appended
        self.var_string = var = StringVariable("Files")
        files = self.FILE_NAMES + [var.Unknown]
        self.table_string = Table.from_list(
            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
        )
        self.var_discrete = var = DiscreteVariable("Files", values=self.FILE_NAMES)
        files = self.FILE_NAMES + [var.Unknown]
        self.table_discrete = Table.from_list(
            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
        )

    def tearDown(self) -> None:
        self.alt_dir.cleanup()

    def __create_files(self):
        # create every column file directly inside alt_dir
        for f in self.FILE_NAMES:
            f = os.path.join(self.alt_dir.name, f)
            with open(f, "w", encoding="utf8"):
                pass
            self.assertTrue(os.path.exists(f))

    def test_origin_not_changed(self):
        """
        Origin exists; keep it unchanged, even though the dataset path also
        includes the files from the column.
        """
        with TemporaryDirectory() as dir_name:
            self.var_string.attributes["origin"] = dir_name
            update_origin(self.table_string, self.alt_dir.name)
            self.assertEqual(
                self.table_string.domain[self.var_string].attributes["origin"], dir_name
            )

    def test_origin_subdir(self):
        """
        Origin is wrong, but the last directory of origin exists in the
        dataset file's directory.
        """
        images_dir = os.path.join(self.alt_dir.name, "subdir")
        os.mkdir(images_dir)

        self.var_string.attributes["origin"] = "/a/b/subdir"
        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"], images_dir
        )

    def test_origin_parents_subdir(self):
        """
        Origin is wrong, but the last directory of origin exists in the
        parent directory of the dataset file.
        """
        # make the dir where dataset is placed
        images_dir = os.path.join(self.alt_dir.name, "subdir")
        os.mkdir(images_dir)

        self.var_string.attributes["origin"] = "/a/b/subdir"
        update_origin(self.table_string, os.path.join(images_dir, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"], images_dir
        )

    def test_column_paths_subdir(self):
        """
        Origin directory does not exist, but the paths from the column
        exist in the dataset's directory.
        """
        self.__create_files()

        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"],
            self.alt_dir.name,
        )

        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_discrete, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_discrete.domain[self.var_discrete].attributes["origin"],
            self.alt_dir.name,
        )

    def test_column_paths_parents_subdir(self):
        """
        Origin directory does not exist, but the paths from the column
        exist in the parent directory of the dataset's directory.
        """
        # dataset is (nominally) placed in a subdir of alt_dir; note that
        # the subdir itself is never created - update_origin only takes
        # dirname of the path, so it need not exist on disk
        dataset_dir = os.path.join(self.alt_dir.name, "subdir")
        self.__create_files()

        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_string, os.path.join(dataset_dir, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"],
            self.alt_dir.name,
        )

        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_discrete, os.path.join(dataset_dir, "data.csv"))
        self.assertEqual(
            self.table_discrete.domain[self.var_discrete].attributes["origin"],
            self.alt_dir.name,
        )


if __name__ == '__main__':
unittest.main()
49 changes: 29 additions & 20 deletions Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
# pylint: disable=protected-access
import csv
import os
import sys
import math
Expand All @@ -10,7 +11,7 @@
import warnings
from datetime import datetime, timezone

from io import StringIO
from tempfile import NamedTemporaryFile, TemporaryDirectory

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -714,27 +715,35 @@ def test_no_date_no_time(self):
self.assertEqual(TimeVariable('relative time').repr_val(1.6), '1.6')

def test_readwrite_timevariable(self):
    """Round-trip a table with a TimeVariable through CSVReader and
    write_file, and check the written file matches the input exactly."""
    content = [
        ("Date", "Feature"),
        ("time", "continuous"),
        ("", ""),
        ("1920-12-12", 1.0),
        ("1920-12-13", 3.0),
        ("1920-12-14", 5.5),
    ]
    # delete=False: the file must be closed before the reader reopens it
    # (required on Windows), so cleanup is done manually below
    with NamedTemporaryFile(
        mode="w", delete=False, newline="", encoding="utf-8"
    ) as input_csv:
        csv.writer(input_csv, delimiter=",").writerows(content)
    with NamedTemporaryFile(mode="w", delete=False) as output_csv:
        pass

    try:
        table = CSVReader(input_csv.name).read()
        self.assertIsInstance(table.domain["Date"], TimeVariable)
        self.assertEqual(table[0, "Date"], "1920-12-12")
        # Dates before 1970 are negative
        self.assertTrue(all(inst["Date"] < 0 for inst in table))

        CSVReader.write_file(output_csv.name, table)

        with open(input_csv.name, encoding="utf-8") as in_f:
            with open(output_csv.name, encoding="utf-8") as out_f:
                self.assertEqual(in_f.read(), out_f.read())
    finally:
        # remove the temp files even when an assertion above fails
        os.unlink(input_csv.name)
        os.unlink(output_csv.name)

def test_repr_value(self):
# https://github.com/biolab/orange3/pull/1760
Expand Down
Loading