Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] IO - Change origin attribute when not found on system #6555

Merged
merged 7 commits into from
Oct 6, 2023
14 changes: 5 additions & 9 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import xlsxwriter
import openpyxl

from Orange.data import _io, Table, Domain, ContinuousVariable
from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin
from Orange.data import Compression, open_compressed, detect_encoding, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
Expand Down Expand Up @@ -164,21 +164,15 @@ def read(self):
skipinitialspace=True,
)
data = self.data_table(reader)

# TODO: Name can be set unconditionally when/if
# self.filename will always be a string with the file name.
# Currently, some tests pass StringIO instead of
# the file name to a reader.
if isinstance(self.filename, str):
data.name = path.splitext(
path.split(self.filename)[-1])[0]
data.name = path.splitext(path.split(self.filename)[-1])[0]
if error and isinstance(error, UnicodeDecodeError):
pos, endpos = error.args[2], error.args[3]
warning = ('Skipped invalid byte(s) in position '
'{}{}').format(pos,
('-' + str(endpos)) if (endpos - pos) > 1 else '')
warnings.warn(warning)
self.set_table_metadata(self.filename, data)
update_origin(data, self.filename)
return data
except Exception as e:
error = e
Expand Down Expand Up @@ -215,6 +209,7 @@ def read(self):
if not isinstance(table, Table):
raise TypeError("file does not contain a data table")
else:
update_origin(table, self.filename)
return table

@classmethod
Expand Down Expand Up @@ -264,6 +259,7 @@ def read(self):
try:
cells = self.get_cells()
table = self.data_table(cells)
update_origin(table, self.filename)
table.name = path.splitext(path.split(self.filename)[-1])[0]
if self.sheet and len(self.sheets) > 1:
table.name = '-'.join((table.name, self.sheet))
Expand Down
82 changes: 79 additions & 3 deletions Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
import os.path
import subprocess
from collections import defaultdict
from typing import Tuple, Optional

import numpy as np
import pandas as pd
from chardet.universaldetector import UniversalDetector

from Orange.data import (
is_discrete_values, MISSING_VALUES, Variable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, Table,
)
from Orange.misc.collections import natural_sorted

__all__ = ["Compression", "open_compressed", "detect_encoding", "isnastr",
"guess_data_type", "sanitize_variable"]
__all__ = [
"Compression",
"open_compressed",
"detect_encoding",
"isnastr",
"guess_data_type",
"sanitize_variable",
"update_origin",
]


class Compression:
Expand Down Expand Up @@ -207,3 +217,69 @@ def mapvalues(arr):
values = [_var.parse(i) for i in orig_values]

return values, var


def _extract_new_origin(
    attr: Variable, table: Table, lookup_dirs: Tuple[str, ...]
) -> Optional[str]:
    """
    Find a usable replacement for a column's "origin" attribute.

    Candidates are tried in order:
    1. The recorded origin itself, if it exists on this system.
    2. A lookup directory containing a subdirectory named like the last
       path component of the recorded origin.
    3. A lookup directory that directly contains every non-missing path
       stored in the column.

    Parameters
    ----------
    attr
        A string or discrete meta variable that has an "origin" attribute.
    table
        Table whose column values are checked against candidate directories.
    lookup_dirs
        Candidate directories, in priority order.

    Returns
    -------
    The new origin directory, or None when no candidate matches.
    """
    origin = attr.attributes["origin"]
    # origin exists on this system - keep it unchanged
    if os.path.exists(origin):
        return origin

    # last dir of origin found inside one of the lookup dirs
    dir_ = os.path.basename(os.path.normpath(origin))
    for ld in lookup_dirs:
        new_dir = os.path.join(ld, dir_)
        if os.path.isdir(new_dir):
            return new_dir

    # all column paths exist inside one of the lookup dirs; skip missing
    # values (empty strings for string variables, NaN for discrete ones).
    # Note: 0 is a valid discrete value index, so test for NaN/emptiness
    # explicitly rather than relying on truthiness of the raw value.
    paths = [
        attr.str_val(v)
        for v in table.get_column(attr)
        if not pd.isna(v) and v != ""
    ]
    if paths:  # an all-missing column gives no evidence - do not match
        for ld in lookup_dirs:
            if all(os.path.exists(os.path.join(ld, p)) for p in paths):
                return ld

    return None


def update_origin(table: Table, file_path: str):
    """
    Repair the "origin" attribute of path-holding meta columns in place.

    When a dataset whose columns store file paths is opened on another
    computer, the absolute "origin" recorded in the variable's attributes
    may no longer exist. For every string/discrete meta column with an
    "origin" attribute, this function searches for a directory that can
    serve as the new origin and, when found, updates the attribute.

    Directories searched (lookup directories):
    1. the directory containing ``file_path``;
    2. its parent directory (covers the common case of a workflow/dataset
       file placed next to a directory of e.g. images).

    A lookup directory is accepted when either the last component of the
    recorded origin exists inside it, or every path listed in the column
    can be found inside it (useful for multi-level relative paths such as
    ``a/b/c/file.txt``).

    Note: the table is modified in place.

    Parameters
    ----------
    table
        Orange Table whose meta columns are inspected and possibly updated.
    file_path
        Path of the loaded dataset; only its directory and that
        directory's parent are considered as new origins.
    """
    dataset_dir = os.path.dirname(file_path)
    # dict.fromkeys keeps insertion order while dropping the duplicate
    # entry that appears when dataset_dir is already the filesystem root
    # (a root directory is its own parent)
    lookup_dirs = tuple(
        dict.fromkeys((dataset_dir, os.path.dirname(dataset_dir)))
    )
    for attr in table.domain.metas:
        if not (attr.is_string or attr.is_discrete):
            continue
        if "origin" not in attr.attributes:
            continue
        new_origin = _extract_new_origin(attr, table, lookup_dirs)
        if new_origin:
            attr.attributes["origin"] = new_origin
124 changes: 123 additions & 1 deletion Orange/data/tests/test_io_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
import os.path
import unittest
from tempfile import TemporaryDirectory

from Orange.data import ContinuousVariable, guess_data_type
import numpy as np

from Orange.data import (
ContinuousVariable,
guess_data_type,
Table,
Domain,
StringVariable,
DiscreteVariable,
)
from Orange.data.io_util import update_origin


class TestIoUtil(unittest.TestCase):
Expand All @@ -10,5 +22,115 @@ def test_guess_continuous_w_nans(self):
ContinuousVariable)


class TestUpdateOrigin(unittest.TestCase):
    """Tests for io_util.update_origin: repairing stale "origin" attributes."""

    # file names stored in the path columns of the test tables
    FILE_NAMES = ["file1.txt", "file2.txt", "file3.txt"]

    def setUp(self) -> None:
        # directory that plays the role of the dataset's new location
        self.alt_dir = TemporaryDirectory()  # pylint: disable=consider-using-with

        # one table with a string path column, one with a discrete path
        # column; each gets an extra missing value appended
        self.var_string = var = StringVariable("Files")
        files = self.FILE_NAMES + [var.Unknown]
        self.table_string = Table.from_list(
            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
        )
        self.var_discrete = var = DiscreteVariable("Files", values=self.FILE_NAMES)
        files = self.FILE_NAMES + [var.Unknown]
        self.table_discrete = Table.from_list(
            Domain([], metas=[var]), np.array(files).reshape((-1, 1))
        )

    def tearDown(self) -> None:
        self.alt_dir.cleanup()

    def __create_files(self):
        # create every column file directly inside alt_dir
        for f in self.FILE_NAMES:
            f = os.path.join(self.alt_dir.name, f)
            with open(f, "w", encoding="utf8"):
                pass
            self.assertTrue(os.path.exists(f))

    def test_origin_not_changed(self):
        """
        Origin exists; keep it unchanged, even though the dataset path also
        includes the files from the column.
        """
        with TemporaryDirectory() as dir_name:
            self.var_string.attributes["origin"] = dir_name
            update_origin(self.table_string, self.alt_dir.name)
            self.assertEqual(
                self.table_string.domain[self.var_string].attributes["origin"], dir_name
            )

    def test_origin_subdir(self):
        """
        Origin is wrong, but the last directory of origin exists in the
        dataset file's directory.
        """
        images_dir = os.path.join(self.alt_dir.name, "subdir")
        os.mkdir(images_dir)

        self.var_string.attributes["origin"] = "/a/b/subdir"
        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"], images_dir
        )

    def test_origin_parents_subdir(self):
        """
        Origin is wrong, but the last directory of origin exists in the
        parent directory of the dataset file.
        """
        # make the dir where dataset is placed
        images_dir = os.path.join(self.alt_dir.name, "subdir")
        os.mkdir(images_dir)

        self.var_string.attributes["origin"] = "/a/b/subdir"
        update_origin(self.table_string, os.path.join(images_dir, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"], images_dir
        )

    def test_column_paths_subdir(self):
        """
        Origin directory does not exist, but the paths from the column
        exist in the dataset's directory.
        """
        self.__create_files()

        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_string, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"],
            self.alt_dir.name,
        )

        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_discrete, os.path.join(self.alt_dir.name, "data.csv"))
        self.assertEqual(
            self.table_discrete.domain[self.var_discrete].attributes["origin"],
            self.alt_dir.name,
        )

    def test_column_paths_parents_subdir(self):
        """
        Origin directory does not exist, but the paths from the column
        exist in the parent directory of the dataset's directory.
        """
        # dataset is (nominally) placed in a subdir of alt_dir; note that
        # the subdir itself is never created - update_origin only takes
        # dirname of the path, so it need not exist on disk
        dataset_dir = os.path.join(self.alt_dir.name, "subdir")
        self.__create_files()

        self.var_string.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_string, os.path.join(dataset_dir, "data.csv"))
        self.assertEqual(
            self.table_string.domain[self.var_string].attributes["origin"],
            self.alt_dir.name,
        )

        self.var_discrete.attributes["origin"] = "/a/b/non-exiting-dir"
        update_origin(self.table_discrete, os.path.join(dataset_dir, "data.csv"))
        self.assertEqual(
            self.table_discrete.domain[self.var_discrete].attributes["origin"],
            self.alt_dir.name,
        )


if __name__ == '__main__':
unittest.main()
49 changes: 29 additions & 20 deletions Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
# pylint: disable=protected-access
import csv
import os
import sys
import math
Expand All @@ -10,7 +11,7 @@
import warnings
from datetime import datetime, timezone

from io import StringIO
from tempfile import NamedTemporaryFile, TemporaryDirectory

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -714,27 +715,35 @@ def test_no_date_no_time(self):
self.assertEqual(TimeVariable('relative time').repr_val(1.6), '1.6')

def test_readwrite_timevariable(self):
    """Round-trip a table with a TimeVariable through CSVReader and
    write_file, and check the written file matches the input exactly."""
    content = [
        ("Date", "Feature"),
        ("time", "continuous"),
        ("", ""),
        ("1920-12-12", 1.0),
        ("1920-12-13", 3.0),
        ("1920-12-14", 5.5),
    ]
    # delete=False: the file must be closed before the reader reopens it
    # (required on Windows), so cleanup is done manually below
    with NamedTemporaryFile(
        mode="w", delete=False, newline="", encoding="utf-8"
    ) as input_csv:
        csv.writer(input_csv, delimiter=",").writerows(content)
    with NamedTemporaryFile(mode="w", delete=False) as output_csv:
        pass

    try:
        table = CSVReader(input_csv.name).read()
        self.assertIsInstance(table.domain["Date"], TimeVariable)
        self.assertEqual(table[0, "Date"], "1920-12-12")
        # Dates before 1970 are negative
        self.assertTrue(all(inst["Date"] < 0 for inst in table))

        CSVReader.write_file(output_csv.name, table)

        with open(input_csv.name, encoding="utf-8") as in_f:
            with open(output_csv.name, encoding="utf-8") as out_f:
                self.assertEqual(in_f.read(), out_f.read())
    finally:
        # remove the temp files even when an assertion above fails
        os.unlink(input_csv.name)
        os.unlink(output_csv.name)

def test_repr_value(self):
# https://github.com/biolab/orange3/pull/1760
Expand Down
Loading