
Merge pull request #36 from CanDIG/daisieh/redcap
DIG-1344: make sure no dup IDs in schemas
daisieh authored Oct 30, 2023
2 parents 0a3a34b + d1d3937 commit fb5d76e
Showing 9 changed files with 117 additions and 177 deletions.
25 changes: 18 additions & 7 deletions CSVConvert.py
@@ -49,7 +49,7 @@ def map_data_to_scaffold(node, line, rownum):
         return result
     if "str" in str(type(node)) and node != "":
         result = eval_mapping(node, rownum)
-        verbose_print(f"Evaluated result is {result}")
+        verbose_print(f"Evaluated result is {result}, {node}, {rownum}")
         return result
     if "dict" in str(type(node)):
         result = {}
@@ -59,6 +59,17 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
+                if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER] = {}
+                if key not in mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key] = []
+                mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key].append(dict)
+                if key not in mappings.INDEXED_DATA["columns"]:
+                    mappings.INDEXED_DATA["columns"][key] = []
+                if "CALCULATED" not in mappings.INDEXED_DATA["columns"][key]:
+                    mappings.INDEXED_DATA["columns"][key].append("CALCULATED")
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None
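Note: this added block is the substance of DIG-1344. Calculated values are recorded under a synthetic CALCULATED sheet in the index, with each nesting level created only if missing, so a key that appears more than once accumulates values rather than clobbering or duplicating entries. A minimal sketch of the same guard pattern, restated with dict.setdefault (the index shape is inferred from the diff; field names are illustrative):

    # Index shape inferred from the diff:
    #   {"data": {sheet: {identifier_field: {column: [values]}}},
    #    "columns": {column: [source_sheets]}}
    INDEXED_DATA = {"data": {}, "columns": {}}
    IDENTIFIER = "submitter_id"  # illustrative identifier field

    def record_calculated(key, value):
        # Create each nesting level on first use, then append, so repeated
        # keys accumulate values instead of overwriting each other.
        by_id = INDEXED_DATA["data"].setdefault("CALCULATED", {}).setdefault(IDENTIFIER, {})
        by_id.setdefault(key, []).append(value)
        # Register CALCULATED as a source sheet for this column exactly once.
        sources = INDEXED_DATA["columns"].setdefault(key, [])
        if "CALCULATED" not in sources:
            sources.append("CALCULATED")

    record_calculated("age_group", "40-49")
    record_calculated("age_group", "50-59")
    print(INDEXED_DATA["data"]["CALCULATED"]["submitter_id"]["age_group"])
    # ['40-49', '50-59']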
@@ -193,7 +204,7 @@ def get_row_for_stack_top(sheet, rownum):
     result = {}
     for param in mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER].keys():
         result[param] = mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER][param][rownum]
-    verbose_print(f"get_row_for_stack_top is {result}")
+    verbose_print(f"get_row_for_stack_top {sheet} is {result}")
     return result


@@ -581,14 +592,14 @@ def csv_convert(input_path, manifest_file, verbose=False):
 
     # # read the raw data
     print("Reading raw data")
-    raw_csv_dfs, output_file = ingest_raw_data(input_path)
+    raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path)
     if not raw_csv_dfs:
         print(f"No ingestable files (csv or xlsx) were found at {input_path}")
         return
 
     print("Indexing data")
     mappings.INDEXED_DATA = process_data(raw_csv_dfs)
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     # if verbose flag is set, warn if column name is present in multiple sheets:
@@ -619,7 +630,7 @@ def csv_convert(input_path, manifest_file, verbose=False):
     if mappings._pop_from_stack() is not None:
         raise Exception(f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}")
 
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     result = {
@@ -628,14 +639,14 @@
     }
     if schema.katsu_sha is not None:
         result["katsu_sha"] = schema.katsu_sha
-    with open(f"{output_file}_map.json", 'w') as f: # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     # add validation data:
     schema.validate_ingest_map(result)
     result["validation_errors"] = schema.validation_failures
     result["statistics"] = schema.statistics
-    with open(f"{output_file}_map.json", 'w') as f: # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     if len(result["validation_errors"]) > 0:
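Note: the recurring change in this file is that the local output_file variable becomes the module-level mappings.OUTPUT_FILE. Judging by the mappings.py hunk below, this lets error paths dump the index without threading the path through every call. A hedged sketch of the shared-module-state pattern (module and attribute names follow this repo; the rest is illustrative):

    import json
    import mappings

    def convert(input_path):
        # Set the shared output prefix once, early; any code that fails later
        # can then write f"{mappings.OUTPUT_FILE}_indexed.json" for post-mortems.
        mappings.OUTPUT_FILE = "cohort"
        mappings.INDEXED_DATA = {"data": {}, "columns": {}}
        with open(f"{mappings.OUTPUT_FILE}_indexed.json", "w") as f:
            json.dump(mappings.INDEXED_DATA, f, indent=4)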
102 changes: 0 additions & 102 deletions ingest_redcap_data.py

This file was deleted.

11 changes: 11 additions & 0 deletions mappings.py
@@ -9,13 +9,16 @@
 INDEX_STACK = []
 INDEXED_DATA = None
 CURRENT_LINE = ""
+OUTPUT_FILE = ""
 
 
 class MappingError(Exception):
     def __init__(self, value):
         self.value = value
 
     def __str__(self):
+        with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f:
+            json.dump(INDEXED_DATA, f, indent=4)
         return repr(f"Check the values for {IDENTIFIER} in {IDENTIFIER_FIELD}: {self.value}")
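With OUTPUT_FILE at module scope, stringifying a MappingError now also writes the current index to disk, so the state that triggered the failure can be inspected. A usage sketch (assumes mappings already imports json and that the globals have been set, as csv_convert() does):

    import mappings

    mappings.OUTPUT_FILE = "my_cohort"  # normally set inside csv_convert()
    mappings.INDEXED_DATA = {"data": {}, "columns": {}}
    try:
        raise mappings.MappingError("duplicate submitter_id")
    except mappings.MappingError as err:
        # str(err) invokes __str__, which dumps my_cohort_indexed.json
        # as a side effect before returning the message.
        print(err)

A file write inside __str__ is an unusual side effect; it trades purity for a debugging artifact produced at exactly the moment a mapping fails.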


@@ -118,6 +121,14 @@ def flat_list_val(data_values):
     return all_items
 
 
+# concatenate several data values
+def concat_vals(data_values):
+    result = []
+    for x in data_values:
+        result.extend(data_values[x].values())
+    return "_".join(result)
+
+
 # Convert various responses to boolean
 def boolean(data_values):
     cell = single_val(data_values)
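From the way concat_vals iterates, data_values appears to map a column name to a per-sheet dict of string cell values; the new helper flattens those values and joins them with underscores. A hypothetical example (the input shape is inferred from the code, not documented in this diff):

    # Inferred input shape: {column_name: {sheet_name: cell_value}}
    data_values = {
        "submitter_donor_id": {"Donor": "DONOR_1"},
        "sample_id": {"Sample": "SAMPLE_A"},
    }
    print(concat_vals(data_values))  # -> "DONOR_1_SAMPLE_A"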
(Diffs for the remaining 6 of the 9 changed files are not shown.)
