Update pack and unpack methods #886

Merged · 14 commits · May 23, 2024
118 changes: 68 additions & 50 deletions pyiron_base/project/archiving/export_archive.py
@@ -1,11 +1,10 @@
import os
import numpy as np
from shutil import copyfile
from shutil import copyfile, rmtree, copytree
from pyfileindex import PyFileIndex
import tarfile
from shutil import rmtree
import tempfile
from pyiron_base.project.archiving.shared import getdir
from pyiron_base.utils.instance import static_isinstance


def new_job_id(job_id, job_translate_dict):
@@ -39,7 +38,7 @@ def filter_function(file_name):
return ".h5" in file_name


def generate_list_of_directories(df_files, directory_to_transfer, archive_directory):
def generate_list_of_directories(df_files, directory_to_transfer):
path_rel_lst = [
os.path.relpath(d, directory_to_transfer) for d in df_files.dirname.unique()
]
@@ -54,16 +53,20 @@ def generate_list_of_directories(df_files, directory_to_transfer, archive_directory):
]


def compress_dir(archive_directory):
def compress_dir(directory_to_transfer, archive_directory):
arch_comp_name = archive_directory + ".tar.gz"
tar = tarfile.open(arch_comp_name, "w:gz")
tar.add(os.path.relpath(archive_directory, os.getcwd()))
tar.close()
with tarfile.open(arch_comp_name, "w:gz") as tar:
tar.add(archive_directory, arcname=os.path.basename(directory_to_transfer))
rmtree(archive_directory)
return arch_comp_name


def copy_files_to_archive(
directory_to_transfer, archive_directory, compressed=True, copy_all_files=False
project,
directory_to_transfer,
archive_directory,
compressed=True,
copy_all_files=False,
):
"""
Create an archive of jobs in directory_to_transfer.
@@ -79,64 +82,70 @@
if not compressed:
compressed = True

if directory_to_transfer[-1] != "/":
directory_to_transfer = os.path.basename(directory_to_transfer)
directory_to_transfer = os.path.normpath(directory_to_transfer)
archive_directory = os.path.normpath(archive_directory)

tempdir = export_files(
directory_to_transfer, compressed, copy_all_files=copy_all_files
)
df = export_database(project, directory_to_transfer, archive_directory)
csv_file_name = os.path.join(tempdir.name, "export.csv")
df.to_csv(csv_file_name)

if compressed:
archived_file = compress_dir(directory_to_transfer, tempdir.name)
copyfile(
archived_file,
os.path.join(
os.path.dirname(os.path.abspath(archive_directory)),
f"{os.path.basename(directory_to_transfer)}.tar.gz",
),
)
else:
directory_to_transfer = os.path.basename(directory_to_transfer[:-1])
# print("directory to transfer: "+directory_to_transfer)
if os.path.exists(archive_directory):
raise ValueError("Folder exists, give different name or allow compression")
# now copy the whole set of folders
copytree(tempdir.name, archive_directory)


def export_files(directory_to_transfer, compressed, copy_all_files=False):
if not copy_all_files:
pfi = PyFileIndex(path=directory_to_transfer, filter_function=filter_function)
else:
pfi = PyFileIndex(path=directory_to_transfer)
df_files = pfi.dataframe[~pfi.dataframe.is_directory]

# create a temporary folder for archiving
tempdir = tempfile.TemporaryDirectory()

# Create directories
dir_lst = generate_list_of_directories(
df_files=df_files,
directory_to_transfer=directory_to_transfer,
archive_directory=archive_directory,
df_files=df_files, directory_to_transfer=directory_to_transfer
)
# print(dir_lst)

# now make these directories
for d in dir_lst:
os.makedirs(d, exist_ok=True)
# Copy files
dir_name_transfer = getdir(path=directory_to_transfer)
os.makedirs(d.replace(directory_to_transfer, tempdir.name), exist_ok=True)

# copy files
for f in df_files.path.values:
copyfile(
f,
os.path.join(
archive_directory,
dir_name_transfer,
tempdir.name,
os.path.relpath(f, directory_to_transfer),
),
)
if compressed:
compress_dir(archive_directory)

return tempdir

def export_database(project_instance, directory_to_transfer, archive_directory):

def export_database(pr, directory_to_transfer, archive_directory):
# here we first check whether the archive directory is a path
# or a project object
if isinstance(archive_directory, str):
if archive_directory[-7:] == ".tar.gz":
archive_directory = archive_directory[:-7]
archive_directory = os.path.basename(archive_directory)
# if the archive_directory is a project
elif static_isinstance(
obj=archive_directory.__class__,
obj_type=[
"pyiron_base.project.generic.Project",
],
):
archive_directory = archive_directory.path
else:
raise RuntimeError(
"""the given path for exporting to,
does not have the correct format paths as string
or pyiron Project objects are expected"""
)

directory_to_transfer = os.path.basename(directory_to_transfer)
pr = project_instance.open(os.curdir)

df = pr.job_table()
job_ids_sorted = sorted(df.id.values)
new_job_ids = list(range(len(job_ids_sorted)))
@@ -153,11 +162,20 @@ def export_database(project_instance, directory_to_transfer, archive_directory):
new_job_id(job_id=job_id, job_translate_dict=job_translate_dict)
for job_id in df.parentid
]
df["project"] = update_project(
project_instance,
directory_to_transfer=directory_to_transfer,
archive_directory=archive_directory,
df=df,
)

# figure out whether we need to update archive names
path_rel_lst = [os.path.relpath(os.path.normpath(p)) for p in df["project"].values]
if os.path.basename(directory_to_transfer) != os.path.basename(archive_directory):
# we need to update the project name
path_rel_lst = [
p.replace(
os.path.basename(directory_to_transfer),
os.path.basename(archive_directory),
)
for p in path_rel_lst
]

df["project"] = path_rel_lst
del df["projectpath"]

return df
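
The new export flow stages everything in a temporary directory, writes the job table alongside the copied HDF5 files as export.csv, and only then either tars the staging area or copies it to the destination. A minimal standalone sketch of that stage-then-compress pattern, assuming nothing beyond the standard library (pack_directory, src and dest are illustrative names, not part of the pyiron_base API):

import os
import tarfile
import tempfile
from shutil import copyfile, copytree


def pack_directory(src, dest, compress=True):
    # Stage the payload in a throwaway directory first, so a failed
    # export never leaves a half-written archive behind.
    with tempfile.TemporaryDirectory() as tmp:
        for root, _, files in os.walk(src):
            for name in files:
                if ".h5" not in name:  # mirrors filter_function() above
                    continue
                rel = os.path.relpath(os.path.join(root, name), src)
                os.makedirs(os.path.dirname(os.path.join(tmp, rel)), exist_ok=True)
                copyfile(os.path.join(root, name), os.path.join(tmp, rel))
        if compress:
            # arcname re-roots the tarball at the project name, matching
            # what compress_dir() does above.
            with tarfile.open(dest + ".tar.gz", "w:gz") as tar:
                tar.add(tmp, arcname=os.path.basename(src))
        else:
            copytree(tmp, dest)  # raises FileExistsError if dest exists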
85 changes: 39 additions & 46 deletions pyiron_base/project/archiving/import_archive.py
@@ -1,11 +1,7 @@
import os
import pandas
import numpy as np
from shutil import rmtree
from distutils.dir_util import copy_tree
import tarfile
from pyiron_base.project.archiving.shared import getdir
from pyiron_base.utils.instance import static_isinstance
from pyiron_base.state import state


@@ -21,56 +17,52 @@ def update_id_lst(record_lst, job_id_lst):


def extract_archive(archive_directory):
fname = archive_directory + ".tar.gz"
tar = tarfile.open(fname, "r:gz")
tar.extractall()
tar.close()


def import_jobs(project_instance, archive_directory, df, compressed=True):
# Copy HDF5 files
# if the archive_directory is a path(string)/name of the compressed file
if static_isinstance(
obj=archive_directory.__class__,
obj_type=[
"pyiron_base.project.generic.Project",
],
):
archive_directory = archive_directory.path
elif isinstance(archive_directory, str):
if archive_directory[-7:] == ".tar.gz":
archive_directory = archive_directory[:-7]
if not compressed:
compressed = True
else:
raise RuntimeError(
"""the given path for importing from,
does not have the correct format paths
as string or pyiron Project objects are expected"""
arch_comp_name = archive_directory + ".tar.gz"
with tarfile.open(arch_comp_name, "r:gz") as tar:
tar.extractall()


def import_jobs_to_new_project(cls, archive_directory, compressed=True):
pass


def import_jobs_to_existing_project(pr, archive_directory, compressed=True):
pass


def prepare_path(pr, archive_directory):
if archive_directory[-7:] == ".tar.gz":
archive_directory = archive_directory[:-7]
elif not os.path.exists(archive_directory + ".tar.gz"):
raise FileNotFoundError("Cannot find archive")

arch_comp_name = archive_directory + ".tar.gz"
with tarfile.open(arch_comp_name, "r:gz") as tar:
target_folder = os.path.join(
os.path.dirname(archive_directory), os.path.basename(tar.members[0].name)
)
if compressed:
extract_archive(os.path.relpath(archive_directory, os.getcwd()))

archive_name = getdir(path=archive_directory)
if os.path.exists(target_folder):
raise ValueError("Cannot extract to existing folder")

# destination folder
des = project_instance.path
# source folder; archive folder
src = os.path.abspath(archive_directory)
copy_tree(src, des)
if compressed:
rmtree(src)
return target_folder, archive_directory

# # Update Database
pr_import = project_instance.open(os.curdir)

def import_jobs(pr, archive_directory):
# now open and extract archive
extract_archive(archive_directory)

# read csv
csv_file_name = os.path.join(pr.path, "export.csv")
df = pandas.read_csv(csv_file_name, index_col=0)
df["project"] = [
os.path.join(pr_import.project_path, os.path.relpath(p, archive_name)) + "/"
os.path.join(pr.project_path, os.path.relpath(p, pr.project_path)) + "/"
for p in df["project"].values
]
df["projectpath"] = len(df) * [pr_import.root_path]
df["projectpath"] = len(df) * [pr.root_path]
# Add jobs to database
job_id_lst = []

for entry in df.dropna(axis=1).to_dict(orient="records"):
if "id" in entry:
del entry["id"]
@@ -84,16 +76,17 @@ def import_jobs(project_instance, archive_directory, df, compressed=True):
entry["timestop"] = pandas.to_datetime(entry["timestop"])
if "username" not in entry:
entry["username"] = state.settings.login_user
job_id = pr_import.db.add_item_dict(par_dict=entry)
job_id = pr.db.add_item_dict(par_dict=entry)
job_id_lst.append(job_id)

# print(job_id_lst)
# Update parent and master ids
for job_id, masterid, parentid in zip(
job_id_lst,
update_id_lst(record_lst=df["masterid"].values, job_id_lst=job_id_lst),
update_id_lst(record_lst=df["parentid"].values, job_id_lst=job_id_lst),
):
if masterid is not None or parentid is not None:
pr_import.db.item_update(
pr.db.item_update(
item_id=job_id, par_dict={"parentid": parentid, "masterid": masterid}
)
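
Because the imported rows receive fresh database ids, the masterid and parentid columns can only be rewritten after every job has been re-inserted, which is why the update runs as a second pass over the id mapping. A compact sketch of that two-pass remapping (remap_ids, add_row and update_row are hypothetical stand-ins for the job table and pr.db.add_item_dict()/pr.db.item_update()):

def remap_ids(old_rows, add_row, update_row):
    # old_rows: list of dicts carrying "id", "parentid" and "masterid" keys.
    new_ids = {}
    # First pass: insert every job and record the old-id -> new-id mapping.
    for row in old_rows:
        new_ids[row["id"]] = add_row({k: v for k, v in row.items() if k != "id"})
    # Second pass: rewrite parent/master references through the mapping,
    # which is only complete once all jobs exist.
    for row in old_rows:
        parent = new_ids.get(row.get("parentid"))
        master = new_ids.get(row.get("masterid"))
        if parent is not None or master is not None:
            update_row(new_ids[row["id"]], {"parentid": parent, "masterid": master})
    return new_ids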
44 changes: 18 additions & 26 deletions pyiron_base/project/generic.py
@@ -111,7 +111,12 @@ class Project(ProjectPath, HasGroups):
"""

def __init__(
self, path="", user=None, sql_query=None, default_working_directory=False
self,
path="",
user=None,
sql_query=None,
default_working_directory=False,
unpack=False,
):
if default_working_directory and path == "":
inputdict = Notebook.get_custom_dict()
@@ -120,6 +125,9 @@ def __init__(
else:
path = "."

if unpack:
path, archive_directory = import_archive.prepare_path(self, path)

super(Project, self).__init__(path=path)

self.user = user
Expand All @@ -135,6 +143,9 @@ def __init__(

self._maintenance = None

if unpack:
self._unpack(archive_directory)

@property
def state(self):
return state
@@ -1819,7 +1830,7 @@ def _update_jobs_in_old_database_format(self, job_name):

def pack(
self,
destination_path,
destination_path=None,
csv_file_name="export.csv",
compress=True,
copy_all_files=False,
@@ -1834,37 +1845,18 @@ def pack(
copy_all_files (bool):
"""
directory_to_transfer = os.path.basename(self.path[:-1])
if destination_path == directory_to_transfer:
raise ValueError(
"The destination_path cannot have the same name as the project to compress."
)
if destination_path is None:
destination_path = directory_to_transfer
export_archive.copy_files_to_archive(
self,
directory_to_transfer,
destination_path,
compressed=compress,
copy_all_files=copy_all_files,
)
df = export_archive.export_database(
self, directory_to_transfer, destination_path
)
df.to_csv(csv_file_name)

def unpack(self, origin_path, csv_file_name="export.csv", compress=True):
"""
by this function, the job table is imported from a given csv file,
and the content of the project directory is copied from a given path

Args:
origin_path (str): the relative path of a directory (or a compressed file without the tar.gz extension)
from which the project directory is copied.
csv_file_name (str): the csv file from which the job_table is copied to the current project
compress (bool): if True, it looks for a compressed file
"""
csv_path = csv_file_name
df = pandas.read_csv(csv_path, index_col=0)
import_archive.import_jobs(
self, archive_directory=origin_path, df=df, compressed=compress
)
def _unpack(self, origin_path):
import_archive.import_jobs(self, origin_path)

@classmethod
def register_tools(cls, name: str, tools):
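
Taken together, the user-facing surface after this change reduces to pack() with an optional destination and an unpack flag on the constructor. A sketch of the intended call pattern as the diff suggests (paths and defaults here are illustrative, not verified against a release):

from pyiron_base import Project

pr = Project("my_project")
# destination_path now defaults to the project name, so this writes
# my_project.tar.gz next to the archive path.
pr.pack()

# Unpacking moved from Project.unpack() into the constructor: unpack=True
# extracts the archive and re-imports the job table via import_jobs().
restored = Project("my_project.tar.gz", unpack=True)
print(restored.job_table())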