Merge pull request #5 from Haitham-ghaida/fix-ei-nov24-rel
Fix ei nov24 rel
cmutel authored Dec 5, 2024
2 parents 84f40f4 + 452769d commit 50f71c8
Showing 2 changed files with 141 additions and 50 deletions.
91 changes: 67 additions & 24 deletions ecoinvent_migrate/main.py
@@ -205,7 +205,7 @@ def generate_biosphere_mapping(
     output_directory: Optional[Path] = None,
     output_version: str = "3.0.0",
     description: Optional[str] = None,
-) -> Path:
+) -> Optional[Path]:
     """Generate a Randonneur mapping file for biosphere edge attributes from source to target."""
     configure_logs(write_logs=write_logs)

@@ -246,23 +246,52 @@ def generate_biosphere_mapping(
         description = f"Data migration file from {source_db_name} to {target_db_name} generated with `ecoinvent_migrate` version {__version__}"

     if not missing_sheet:
-        data = pd.read_excel(io=excel_filepath, sheet_name=candidates[0]).to_dict(orient="records")
-        data = source_target_biosphere_pair(
-            data=data,
-            source_version=source_version,
-            target_version=target_version,
-            keep_deletions=keep_deletions,
-        )
-        affected_uuids = {
-            o["source"]["uuid"]
-            for o in itertools.chain(data.get("replace", []), data.get("delete", []))
-        }
-        data = supplement_biosphere_changes_with_real_data_comparison(
-            data=data,
-            affected_uuids=affected_uuids,
-            source_version=source_version,
-            target_version=target_version,
-        )
+        # Try reading the sheet
+        df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0])
+
+        # Handle the multi-index case
+        if df.columns[0].startswith("**"):
+            logger.debug("Detected multi-index format, adjusting reading parameters")
+            df = pd.read_excel(io=excel_filepath, sheet_name=candidates[0], skiprows=1)
+
+        # Handle the new format case
+        if "deleted exchanges" in df.columns:
+            logger.debug("Detected new exchange format, adjusting data structure")
+            # Get the actual column headers from the first row
+            new_headers = {col: val for col, val in df.iloc[0].items() if isinstance(val, str)}
+            df = df.rename(columns=new_headers).iloc[1:]
+
+        if df.empty:
+            logger.info(
+                "EE Deletions sheet is empty in change report for {source_v} to {target_v}. This likely means no biosphere changes.",
+                source_v=source_version,
+                target_v=target_version,
+            )
+            data = {"delete": [], "replace": []}
+        else:
+            data = df.to_dict(orient="records")
+            data = source_target_biosphere_pair(
+                data=data,
+                source_version=source_version,
+                target_version=target_version,
+                keep_deletions=keep_deletions,
+            )
+            # Ensure both keys exist
+            if "delete" not in data:
+                data["delete"] = []
+            if "replace" not in data:
+                data["replace"] = []
+
+            affected_uuids = {
+                o["source"]["uuid"]
+                for o in itertools.chain(data.get("replace", []), data.get("delete", []))
+            }
+            data = supplement_biosphere_changes_with_real_data_comparison(
+                data=data,
+                affected_uuids=affected_uuids,
+                source_version=source_version,
+                target_version=target_version,
+            )
     else:
         data = supplement_biosphere_changes_with_real_data_comparison(
             data={"delete": [], "replace": []},
@@ -271,16 +300,28 @@ def generate_biosphere_mapping(
             target_version=target_version,
         )

-    if not data["delete"] and not data["replace"]:
-        logger.info("It seems like there are no biosphere changes for this release. Doing nothing.")
-        return
+    # Ensure we have non-empty data before creating Datapackage
+    has_data = False
+    cleaned_data = {}
+    for key in ["delete", "replace"]:
+        if data.get(key) and len(data[key]) > 0:
+            cleaned_data[key] = data[key]
+            has_data = True
+
+    if not has_data:
+        logger.info("No valid biosphere changes found after processing. Doing nothing.")
+        return None

     dp = Datapackage(
         name=f"{source_db_name}-{target_db_name}",
         description=description,
         contributors=[
-            {"title": "ecoinvent association", "path": "https://ecoinvent.org/", "roles": ["author"]},
-            {"title": "Chris Mutel", "path": "https://chris.mutel.org/", "roles": ["wrangler"]},
+            {
+                "title": "ecoinvent association",
+                "path": "https://ecoinvent.org/",
+                "role": "author",
+            },
+            {"title": "Chris Mutel", "path": "https://chris.mutel.org/", "role": "wrangler"},
         ],
         mapping_source=MappingConstants.ECOSPOLD2_BIO,
         mapping_target=MappingConstants.ECOSPOLD2_BIO,
@@ -290,7 +331,9 @@ def generate_biosphere_mapping(
         target_id=target_db_name,
         licenses=licenses,
     )
-    for key, value in data.items():
+
+    # Only add non-empty data sections
+    for key, value in cleaned_data.items():
         dp.add_data(key, value)

     if write_file:
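The detection logic added to `generate_biosphere_mapping` above can be exercised on its own. A minimal sketch follows, assuming pandas is installed and a change report workbook is available locally; the `"**"` and `"deleted exchanges"` sentinels mirror the committed code, while the filename and sheet name are invented placeholders:

    from pathlib import Path

    import pandas as pd

    # Placeholder path - the real workbook comes from the ecoinvent change report download
    excel_filepath = Path("Change Report Annex.xlsx")
    sheet = "EE Deletions"

    # Try reading the sheet as-is
    df = pd.read_excel(io=excel_filepath, sheet_name=sheet)

    # Multi-index case: a leading "**..." banner row pushes the real headers down one row
    if df.columns[0].startswith("**"):
        df = pd.read_excel(io=excel_filepath, sheet_name=sheet, skiprows=1)

    # New format case: generic headers, with the real column labels in the first data row
    if "deleted exchanges" in df.columns:
        new_headers = {col: val for col, val in df.iloc[0].items() if isinstance(val, str)}
        df = df.rename(columns=new_headers).iloc[1:]

    # An empty sheet is treated as "no biosphere changes for this release"
    records = [] if df.empty else df.to_dict(orient="records")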
100 changes: 74 additions & 26 deletions ecoinvent_migrate/wrangling.py
@@ -311,21 +311,42 @@ def split_replace_disaggregate(data: List[dict], target_lookup: dict) -> dict:


 def get_column_labels(example: dict, version: str) -> dict:
-    """Guess column labels from Excel change report annex."""
+    """Guess column labels from Excel change report annex.
+    Now handles multiple formats:
+    - Standard format: "UUID/ID - version"
+    - New format: Where labels might be in values
+    """
     uuid_tries = [f"UUID - {version}", f"ID - {version}"]
+    name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
+
+    # Try standard format first
     for uuid_try in uuid_tries:
         if uuid_try in example:
             uuid = uuid_try
             break
     else:
-        raise ValueError(f"Can't find uuid field for database version {version} in {example}")
-    name_tries = [f"Name - {version}", f"{version} name", f"{version} - name"]
+        # If standard format fails, check for new format
+        for key, value in example.items():
+            if isinstance(value, str) and any(try_pattern in value for try_pattern in uuid_tries):
+                uuid = key
+                break
+        else:
+            raise ValueError(f"Can't find uuid field for database version {version} in {example}")
+
+    # Same pattern for name
     for name_try in name_tries:
         if name_try in example:
             name = name_try
             break
     else:
-        raise ValueError(f"Can't find name field for database version {version} in {example}")
+        for key, value in example.items():
+            if isinstance(value, str) and any(try_pattern in value for try_pattern in name_tries):
+                name = key
+                break
+        else:
+            raise ValueError(f"Can't find name field for database version {version} in {example}")

     return {
         "uuid": uuid,
         "name": name,
@@ -335,33 +356,56 @@ def source_target_biosphere_pair(
 def source_target_biosphere_pair(
     data: List[dict], source_version: str, target_version: str, keep_deletions: bool
 ) -> List[dict]:
-    """Turn pandas DataFrame rows into source/target pairs."""
+    """Turn pandas DataFrame rows into source/target pairs.
+    The function now handles both old and new EE Deletions formats:
+    - Old format: Direct source/target columns
+    - New format: Deletion/replacement columns with explicit relationships
+    """
+    # For empty data, return empty structure
+    if not data:
+        return {"replace": [], "delete": []}
+
+    # Try old format first
     source_labels = get_column_labels(example=data[0], version=source_version)
     target_labels = get_column_labels(example=data[0], version=target_version)

-    formatted = {
-        "replace": [
-            {
-                "source": {k: row[v] for k, v in source_labels.items()},
-                "target": {k: row[v] for k, v in target_labels.items()},
-                "conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
-                "comment": row.get("Comment"),
-            }
-            for row in data
-            if not isnan(row[target_labels["uuid"]])
-        ]
-    }
-    if keep_deletions:
-        formatted["delete"] = [
-            {
-                "source": {k: row[v] for k, v in source_labels.items()},
-                "comment": row.get("Comment"),
-            }
-            for row in data
-            if isnan(row[target_labels["uuid"]])
-        ]
+    # Initialize the result structure
+    formatted = {"replace": [], "delete": [] if keep_deletions else None}
+
+    # Process each row
+    for row in data:
+        # Skip empty or invalid rows
+        if any(isnan(row.get(v)) for v in source_labels.values()):
+            continue
+
+        # Create source entry
+        source_entry = {k: row[v] for k, v in source_labels.items()}
+
+        # Check if there's a valid target
+        has_target = not any(isnan(row.get(v, float("nan"))) for v in target_labels.values())
+
+        if has_target:
+            formatted["replace"].append(
+                {
+                    "source": source_entry,
+                    "target": {k: row[v] for k, v in target_labels.items()},
+                    "conversion_factor": float(row.get("Conversion Factor (old-new)", 1.0)),
+                    "comment": row.get("Comment"),
+                }
+            )
+        elif keep_deletions:
+            formatted["delete"].append(
+                {
+                    "source": source_entry,
+                    "comment": row.get("Comment"),
+                }
+            )
+
+    # Clean up the formatted data
     for lst in formatted.values():
+        if lst is None:
+            continue
         for obj in lst:
             if "comment" in obj and (not obj["comment"] or isnan(obj["comment"])):
                 del obj["comment"]
@@ -370,4 +414,8 @@ def source_target_biosphere_pair(
             ):
                 del obj["conversion_factor"]

+    # Remove empty delete list if not keeping deletions
+    if not keep_deletions:
+        del formatted["delete"]
+
     return formatted
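Two short usage sketches for the reworked wrangling helpers. Both use invented column labels, UUIDs, and version strings purely for illustration, and assume the module-level `isnan` helper tolerates non-numeric values (the committed code already calls it on UUID strings, so this must hold).

First, `get_column_labels`: in the standard format the version-specific labels are the dict keys; in the new format the keys are generic and the labels sit in the cell values, which is what the fallback loop resolves:

    from ecoinvent_migrate.wrangling import get_column_labels

    # Standard format: the labels are the dict keys themselves
    standard_row = {"UUID - 3.10": "aaaa-1111", "Name - 3.10": "Carbon dioxide"}
    labels = get_column_labels(example=standard_row, version="3.10")
    # labels["uuid"] == "UUID - 3.10"; labels["name"] == "Name - 3.10"

    # New format: generic keys ("Unnamed: ..." is typical pandas output), labels in values
    new_format_row = {"Unnamed: 0": "UUID - 3.10", "Unnamed: 1": "Name - 3.10"}
    labels = get_column_labels(example=new_format_row, version="3.10")
    # labels["uuid"] == "Unnamed: 0"; labels["name"] == "Unnamed: 1"

Second, `source_target_biosphere_pair`: a row whose target columns are filled becomes a `replace` pair, while a row with NaN targets becomes a `delete` entry when `keep_deletions` is true:

    from math import nan

    from ecoinvent_migrate.wrangling import source_target_biosphere_pair

    rows = [
        {  # renamed flow: source and target both present -> "replace"
            "UUID - 3.10": "aaaa-1111",
            "Name - 3.10": "Xenon-137",
            "UUID - 3.11": "bbbb-2222",
            "Name - 3.11": "Xenon-137m",
            "Conversion Factor (old-new)": 1.0,
            "Comment": "Renamed",
        },
        {  # flow with no successor: NaN targets -> "delete"
            "UUID - 3.10": "cccc-3333",
            "Name - 3.10": "Obsolete flow",
            "UUID - 3.11": nan,
            "Name - 3.11": nan,
            "Conversion Factor (old-new)": nan,
            "Comment": nan,
        },
    ]

    result = source_target_biosphere_pair(
        data=rows, source_version="3.10", target_version="3.11", keep_deletions=True
    )
    # result["replace"] holds one source/target pair; result["delete"] one source-only entry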
