From 5fd06cb5dedb85d5a51770018c3bfe424025fc0d Mon Sep 17 00:00:00 2001
From: arnav-singhal <43693924+arnav-singhal@users.noreply.github.com>
Date: Tue, 6 Aug 2024 14:43:02 -0700
Subject: [PATCH 1/6] Create utilities.rst

---
 docs/source/usage/utilities.rst | 80 +++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 docs/source/usage/utilities.rst

diff --git a/docs/source/usage/utilities.rst b/docs/source/usage/utilities.rst
new file mode 100644
index 0000000..5153889
--- /dev/null
+++ b/docs/source/usage/utilities.rst
@@ -0,0 +1,80 @@
+Output Format
+=============
+
+ExaEpi writes data on two levels: individual data and community data.
+
+Individual data fields:
+-----------------------
+Individual datasets have additional fields written on day 0 that are constant across the simulation. Data is written as either an ``int`` or a ``real``.
+
+- Day 0 only (constant)
+
+  - Int data:
+
+    - ``age_group``: Ranges from 0 to 4, indicating age group of agent. Age ranges are 0-4, 5-17, 18-29, 30-64, and 65+, respectively.
+
+    - ``family``: ID of family of agent.
+
+    - ``home_i``: x-coordinate of home location on AMReX grid. Can be used to match particle to home community.
+
+    - ``home_j``: y-coordinate of home location on AMReX grid. Can be used to match particle to home community.
+
+    - ``work_i``: x-coordinate of work location on AMReX grid. Can be used to match particle to work community.
+
+    - ``work_j``: y-coordinate of work location on AMReX grid. Can be used to match particle to work community.
+
+    - ``nborhood``: Home neighborhood.
+
+    - ``school``: Ranges from -1 to 5, indicating type of school. -1 indicates not of school age, 0 indicates of school age but not in school, 1 indicates high school, 2 indicates middle school, and 3-5 indicate different elementary schools.
+
+    - ``work_nborhood``: Work neighborhood.
+
+    - ``strain``: Marks which strain of disease (used for multiple diseases).
+
+  - Real data:
+
+    - ``incubation_period``: (currently misleadingly named) Latent period length, i.e. time between exposure and infectiousness.
+
+    - ``infectious_period``: Infectious period length, i.e. time during which agent can spread disease.
+
+    - ``symptomdev_period``: Symptom development period length (known as incubation period in some contexts), i.e. time between exposure and symptoms appearing.
+
+- Every day (time-varying)
+
+  - Int data
+
+    - ``withdrawn``: Marks whether the agent is withdrawn. 0 for not withdrawn, 1 for withdrawn.
+
+    - ``status``: Ranges from 0 to 4, indicating disease status. Corresponds to never infected, infected, immune, susceptible, and dead, respectively.
+
+    - ``symptomatic``: Ranges from 0 to 2, indicating symptomaticity. Corresponds to presymptomatic (not yet symptomatic but will be), symptomatic, and asymptomatic, respectively.
+
+  - Real data
+
+    - ``treatment_timer``: Time that agent has been in hospital.
+
+    - ``disease_counter``: Time that agent has been infected.
+
+    - ``infection_prob``: Probability that agent is infected at end of day.
+
+Community data fields:
+----------------------
+All data is written as a ``real``, although the values are integral. If a community does not exist, its ``unit``, ``FIPS``, ``Tract``, and ``comm`` values will be -1, while all agent counts will be 0.
+
+- ``total``: Total population in community.
+
+- ``never_infected``: Number of never-infected agents in community.
+
+- ``infected``: Number of infected agents in community.
+
+- ``immune``: Number of immune (recently recovered from disease) agents in community.
+
+- ``susceptible``: Number of susceptible (recovered from disease long enough ago to be able to be infected again) agents in community.
+
+- ``unit``: Unit at cell.
+
+- ``FIPS``: FIPS code (county code) of community.
+
+- ``Tract``: Census tract of community.
+
+- ``comm``: Community ID (unique).

From 2ea286cc6d1444b3c71ffa54c8003a26588408f1 Mon Sep 17 00:00:00 2001
From: arnav-singhal <43693924+arnav-singhal@users.noreply.github.com>
Date: Tue, 6 Aug 2024 16:03:18 -0700
Subject: [PATCH 2/6] Update utilities.rst

---
 docs/source/usage/utilities.rst | 105 +++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 8 deletions(-)

diff --git a/docs/source/usage/utilities.rst b/docs/source/usage/utilities.rst
index 5153889..00f55fe 100644
--- a/docs/source/usage/utilities.rst
+++ b/docs/source/usage/utilities.rst
@@ -1,19 +1,19 @@
 Output Format
 =============
 
-ExaEpi writes data on two levels: individual data and community data. 
+ExaEpi writes data on two levels: individual data and community data.
 
-Individual data fields:
------------------------
+Individual data fields
+----------------------
 Individual datasets have additional fields written on day 0 that are constant across the simulation. Data is written as either an ``int`` or a ``real``.
 
 - Day 0 only (constant)
- 
+
   - Int data:
- 
+
     - ``age_group``: Ranges from 0 to 4, indicating age group of agent. Age ranges are 0-4, 5-17, 18-29, 30-64, and 65+, respectively.
 
-    - ``family``: ID of family of agent. 
+    - ``family``: ID of family of agent.
 
     - ``home_i``: x-coordinate of home location on AMReX grid. Can be used to match particle to home community.
@@ -57,8 +57,8 @@ Individual datasets have additional fields written on day 0 that are constant
     - ``infection_prob``: Probability that agent is infected at end of day.
 
-Community data fields:
-----------------------
+Community data fields
+---------------------
 All data is written as a ``real``, although the values are integral. If a community does not exist, its ``unit``, ``FIPS``, ``Tract``, and ``comm`` values will be -1, while all agent counts will be 0.
 
 - ``total``: Total population in community.
@@ -78,3 +78,92 @@ All data is written as a ``real``, although the values are integral. If a comm
 - ``Tract``: Census tract of community.
 
 - ``comm``: Community ID (unique).
+
+Reading output
+==============
+Output is read in different ways, depending on the I/O format selected. If the ``-DAMReX_HDF5`` flag is set to ``TRUE`` when compiling, HDF5 will be used. Otherwise, the default AMReX plotfile will be written.
+
+AMReX plotfile
+--------------
+Any library that has functionality for AMReX plotfiles can be used: see the `AMReX documentation `__ for more.
+
+We suggest using yt with Python:
+
+.. code-block:: python
+
+    import yt
+    from yt.frontends.boxlib.data_structures import AMReXDataset
+
+    ds = AMReXDataset("plt00000")
+    ad = ds.all_data()
+
+    community_totals = ad["total"]
+    statuses = ad["particle_status"]
+    infection_probs = ad["particle_infection_prob"]
+
+Note that individual-level fields are prefixed by ``particle_``, while community-level fields are as-is.
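+
+As a quick sanity check, the community-level fields can be combined directly. Below is a minimal sketch (continuing from the ``ad`` container above; the ``total`` of a nonexistent community is 0, hence the guard) of the currently-infected fraction per community:
+
+.. code-block:: python
+
+    import numpy as np
+
+    frac_infected = np.array(ad["infected"]) / np.maximum(np.array(ad["total"]), 1)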
+
+HDF5
+----
+The individual-level data is contained in files at ``plt*****/agents/agents.h5``, while the community-level data is contained in files ``plt*****.h5``. Note that string attributes are written as byte strings, i.e. ``b'FIPS'`` instead of ``"FIPS"``.
+
+Individual-level data
+^^^^^^^^^^^^^^^^^^^^^
+The data itself is located in the ``level_0`` group, separated by datatype. ``int`` data is at ``level_0/data:datatype=0`` and ``real`` data is at ``level_0/data:datatype=1``.
+
+Data is written agent-by-agent, so all fields for a given agent will be contiguous. The first two ``int`` components for any agent are its CPU and its particle ID on that CPU, while the first two ``real`` components for any agent are its x- and y-positions on the AMReX grid.
+
+Not counting these first two unlabelled components, the number of ``int`` components is stored as an attribute of the file named ``"num_component_int"``, so the total number of components per agent is ``num_component_int + 2``. The names of the remaining ``int`` components can be accessed as attributes of the file named ``"int_component_x"``, where x ranges from 2 to ``num_component_int + 1``. Thus, the size of the ``level_0/data:datatype=0`` dataset is N * (num_component_int + 2), where N is the number of agents.
+
+The same scheme applies to ``real`` data, with every instance of ``int`` replaced with ``real``.
+
+Community-level data
+^^^^^^^^^^^^^^^^^^^^
+Each field is written as its own dataset in the ``level_0`` group. The number of datasets can be accessed as an attribute of the file named ``"num_components"``, and the name of each dataset can be accessed as the attribute named ``"component_x"``, where x ranges from 0 to ``num_components - 1``; the corresponding dataset is found at ``level_0/data:datatype=x``. The order of the data is the same within a plotfile, so one can, for example, match the community IDs to the FIPS codes by index.
+
+Python example
+^^^^^^^^^^^^^^
+We provide an example of parsing the files using h5py and Python:
+
+.. code-block:: python
+
+    import h5py
+
+    with h5py.File("plt00000.h5", "r") as comm_file:
+        # Get the datatype number for each field
+        community_indices = {}
+        for i in range(comm_file.attrs["num_components"][0]):
+            community_indices[comm_file.attrs["component_" + str(i)]] = i
+        # community_indices is:
+        # {b'total': 0, b'never_infected': 1, b'infected': 2, b'immune': 3,
+        #  b'susceptible': 4, b'unit': 5, b'FIPS': 6, b'Tract': 7, b'comm': 8}
+
+        # For example, read the totals
+        community_totals = comm_file["level_0"]["data:datatype=" + str(community_indices[b'total'])][()]
+
+    with h5py.File("plt00000/agents/agents.h5", "r") as agent_file:
+        # Get the column index for each int field
+        int_indices = {}
+        num_int_comps = agent_file.attrs["num_component_int"][0] + 2
+        for i in range(2, num_int_comps):
+            int_indices[agent_file.attrs["int_component_" + str(i)]] = i
+
+        int_data = agent_file["level_0"]["data:datatype=0"][()].reshape(-1, num_int_comps)
+        statuses = int_data[:, int_indices[b'status']]
+
+        # Get the column index for each real field
+        real_indices = {}
+        num_real_comps = agent_file.attrs["num_component_real"][0] + 2
+        for i in range(2, num_real_comps):
+            real_indices[agent_file.attrs["real_component_" + str(i)]] = i
+
+        real_data = agent_file["level_0"]["data:datatype=1"][()].reshape(-1, num_real_comps)
+        infection_probs = real_data[:, real_indices[b'infection_prob']]
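+
+Since these attribute lookups recur for every field, it can be convenient to build the name-to-column maps once. Below is a small sketch (the ``component_map`` helper is ours, not part of ExaEpi) that also decodes the byte-string names to ordinary strings:
+
+.. code-block:: python
+
+    def component_map(agent_file, kind="int"):
+        # Map decoded component names -> column index, skipping the two
+        # unlabelled leading components (CPU and particle ID).
+        n = agent_file.attrs["num_component_" + kind][0] + 2
+        return {agent_file.attrs[kind + "_component_" + str(i)].decode(): i
+                for i in range(2, n)}
+
+    # e.g. int_data[:, component_map(agent_file)["status"]] gives the statuses.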
+
+Processing scripts
+==================
+We provide a few processing scripts for visualization and data aggregation.
+
+Visualization
+-------------
+There is

From 63e75b627906ce3e48493a6f643f6c763ec5db24 Mon Sep 17 00:00:00 2001
From: Arnav Singhal
Date: Tue, 6 Aug 2024 17:02:43 -0700
Subject: [PATCH 3/6] add page and finish viz

---
 docs/source/index.rst                |  1 +
 docs/source/install/dependencies.rst |  3 +--
 docs/source/usage/utilities.rst      | 21 ++++++++++++++++-----
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index cbe9b29..fac58a0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -50,6 +50,7 @@ Usage
    :hidden:
 
    usage/how_to_run
+   usage/utilities
 
 Development
 -----------

diff --git a/docs/source/install/dependencies.rst b/docs/source/install/dependencies.rst
index 326eaa3..f7b77ea 100644
--- a/docs/source/install/dependencies.rst
+++ b/docs/source/install/dependencies.rst
@@ -18,5 +18,4 @@ Optional dependencies include:
 - `OpenMP 3.1+ `__: for threaded CPU execution
 - `CCache `__: to speed up rebuilds (needs 3.7.9+ for CUDA)
 - `Ninja `__: for faster parallel compiles
-- `Parallel HDF5 `__: to write to HDF5. Parallel HDF5 is required, and modules with pre-built Parallel HDF5 installations are
-available on most HPC machines.
+- `Parallel HDF5 `__: to write to HDF5. Parallel HDF5 is required, and modules with pre-built Parallel HDF5 installations are available on most HPC machines.

diff --git a/docs/source/usage/utilities.rst b/docs/source/usage/utilities.rst
index 00f55fe..982448b 100644
--- a/docs/source/usage/utilities.rst
+++ b/docs/source/usage/utilities.rst
@@ -1,11 +1,14 @@
+Utilities
+#########
+
 Output Format
 =============
-
 ExaEpi writes data on two levels: individual data and community data.
 
 Individual data fields
 ----------------------
-Individual datasets have additional fields written on day 0 that are constant across the simulation. Data is written as either an ``int`` or a ``real``.
+Individual datasets have additional fields written on day 0 that are constant across the simulation.
+Data is written as either an ``int`` or a ``real``.
 
 - Day 0 only (constant)
@@ -81,11 +84,14 @@ All data is written as a ``real``, although the values are integral. If a comm
 
 Reading output
 ==============
-Output is read in different ways, depending on the I/O format selected. If the ``-DAMReX_HDF5`` flag is set to ``TRUE`` when compiling, HDF5 will be used. Otherwise, the default AMReX plotfile will be written.
+Output is read in different ways, depending on the I/O format selected.
+If the ``-DAMReX_HDF5`` flag is set to ``TRUE`` when compiling, HDF5 will be used.
+Otherwise, the default AMReX plotfile will be written.
 
 AMReX plotfile
 --------------
-Any library that has functionality for AMReX plotfiles can be used: see the `AMReX documentation `__ for more.
+Any library that has functionality for AMReX plotfiles can be used:
+see the `AMReX documentation `__ for more.
 
 We suggest using yt with Python:
@@ -166,4 +172,9 @@ We provide a few processing scripts for visualization and data aggregation.
 
 Visualization
 -------------
-There is
+There are some visualization scripts located in ``utilities/plotMovie``.
+Primarily, ``generate_frames.py`` allows one to generate still frames from
+either native AMReX output or HDF5 output, plotting infections (or an arbitrary
+function of community-level data) spatially across a given map (as a .shx file).
+Some of these shapefiles (for the USA, California, and the Bay Area) are located
+in the ``data/`` directory. See the script file and its comments for further details.
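+
+For example, a typical invocation (directory names are illustrative), followed by the
+ffmpeg command suggested in the script's docstring to stitch the frames into a movie::
+
+    python generate_frames.py sim_results/ data/bayarea frames/
+    ffmpeg -framerate 1 -r 30 -i frames/frame%05d.png -pix_fmt yuv420p movie.mp4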
From a945f90ecc2fef6921af4d9458464a63867d4ab5 Mon Sep 17 00:00:00 2001
From: Arnav Singhal
Date: Tue, 6 Aug 2024 17:23:55 -0700
Subject: [PATCH 4/6] clean up frames script

---
 utilities/plotMovie/generate_frames.py | 37 ++++++++++++--------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/utilities/plotMovie/generate_frames.py b/utilities/plotMovie/generate_frames.py
index 40dbc45..3267acb 100644
--- a/utilities/plotMovie/generate_frames.py
+++ b/utilities/plotMovie/generate_frames.py
@@ -6,7 +6,11 @@
 ffmpeg -framerate 1 -r 30 -i frames/frame%05d.png -pix_fmt yuv420p movie.mp4
 
 Can adjust inputs directly at bottom of file or as command-line input:
-python generate_frames.py [sim results dir] [.shp dir] [output dir (optional)]
+python generate_frames.py [sim results dir] [.shx dir] [output dir (optional)]
+
+The function being plotted can be altered in the get_raw() functions defined
+in the get_raw_data() and get_raw_data_hdf5() functions, such as plotting
+cumulative infections, proportions of infections, or raw counts.
 
 Endpoints of color range can be set in or passed into generate_plot, or in
 main function, as necessary.
@@ -19,19 +23,21 @@
 """
 
 import numpy as np
+import pandas as pd
+import geopandas as gpd
 
+# h5py is unnecessary if using native AMReX output
 import h5py
 
+# yt is unnecessary if using HDF5 output
 import yt
-# from yt.frontends import boxlib
 from yt.frontends.boxlib.data_structures import AMReXDataset
 
 import matplotlib.pyplot as plt
-from matplotlib.animation import FuncAnimation
 
-import pandas as pd
-import geopandas as gpd
-from shapely.geometry import shape
+# IF .SHX FILE NEEDS TO BE GENERATED, UNCOMMENT AND CHANGE CODE IN get_gdf()
+# import fiona
 
 import os
 import sys
@@ -114,24 +120,15 @@ def get_gdf(prefix: str):
 
     All files must be present and must begin with prefix
     """
-    # with open(prefix + ".shp", "rb") as shp, \
-    #      open(prefix + ".dbf", "rb") as dbf, \
-    #      open(prefix + ".prj", "rb") as prj:
-    #     r = shapefile.Reader(shp=shp, dbf=dbf, prj=prj)
-    #     attributes, geometry = [], []
-    #     field_names = [field[0] for field in r.fields[1:]]
-    #     for row in r.shapeRecords():
-    #         geometry.append(shape(row.shape.__geo_interface__))
-    #         attributes.append(dict(zip(field_names, row.record)))
-    #     r.close()
-    #     gdf = gpd.GeoDataFrame(data = attributes, geometry = geometry)
-
-    # Below code requires a .shx file but seems to be much faster than above
-    # building .shx file can easily be done for you by running code inside a
+    # Below code requires a .shx file. Building the .shx file
+    # can be done by running the code inside a
     # with fiona.Env(SHAPE_RESTORE_SHX = "YES"):
     # block (remember to import fiona explicitly)
    gdf = gpd.read_file(prefix + ".shp", driver="esri")
+    # with fiona.Env(SHAPE_RESTORE_SHX = "YES"):
+    #     gdf = gpd.read_file(prefix + ".shp", driver="esri")
 
     cols = list(gdf.columns)
     # Try to find county code columns. CAPS for CA/US state data, lowercase for BA data

From fbaa7481eee856c9024853b8919a57e2096430fe Mon Sep 17 00:00:00 2001
From: arnav-singhal
Date: Wed, 7 Aug 2024 16:03:12 -0700
Subject: [PATCH 5/6] add hdf5 aggregation script

---
 utilities/hdf5_process.py | 202 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 utilities/hdf5_process.py

diff --git a/utilities/hdf5_process.py b/utilities/hdf5_process.py
new file mode 100644
index 0000000..105c73c
--- /dev/null
+++ b/utilities/hdf5_process.py
@@ -0,0 +1,202 @@
+"""Some processing functions to help aggregate data by county or tract.
+If tract-level information is desired, simply replace any b'FIPS' occurrences
+with b'Tract' to aggregate by census tract instead of FIPS code.
+
+Generally useful functions include:
+    get_comms_from_[pos/coord]: convert particle positions/coordinates to
+                                community indices so that FIPS codes or
+                                tracts can be matched up
+    fast_get_counties: gets a list of particles' home counties/tracts
+    get_home_and_work_counties: gets a list of particles' home and work counties/tracts
+
+An example of using these functions to aggregate total infection counts across
+counties and age groups is provided in get_age_county_counts and write_all_seeds.
+mpi4py is only needed when writing with MPI; write_all_seeds is the only function
+that depends on it, so feel free to comment out the import otherwise.
+"""
+
+import numpy as np
+import pandas as pd
+from mpi4py import MPI
+import h5py
+import glob
+
+def get_comms_from_pos(xs: np.ndarray, ys: np.ndarray):
+    """Get community indices from AMReX position values.
+    These are the first two components in the real data for every
+    particle. Note: recovering the grid extent from the positions
+    assumes at least one agent lives in the first column (i = 0)
+    and first row (j = 0) of the grid."""
+
+    x_max = np.rint(np.max(1 / xs) / 2)
+    y_max = np.rint(np.max(1 / ys) / 2)
+
+    adj_xs = np.rint(x_max * xs - 0.5)
+    adj_ys = np.rint(y_max * ys - 0.5)
+    return (adj_xs + adj_ys * x_max).astype(int)
+
+def get_comms_from_coord(i: np.ndarray, j: np.ndarray):
+    """Get community indices from home/work_i and _j coordinates.
+    These data are only written on day 0, so this must be used with
+    day 0 data files. Note: recovering the grid width assumes at
+    least one agent lives in the last column of the grid."""
+
+    x_max = np.max(i) + 1
+    return i + j * x_max
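+
+# Example of the indexing convention assumed above (values are illustrative):
+# on a grid that is 4 communities wide, an agent with home_i = 2 and home_j = 1
+# lives in community index 2 + 1 * 4 = 6; get_comms_from_pos recovers the same
+# index from that agent's normalized x- and y-positions.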
+
+def comms_to_counties(comms, comm_ref, county_ref):
+    """Converts community indices to county FIPS codes.
+    Inputs:
+        comms: array of community indices to be converted
+        comm_ref: array of all community indices, in the same order as county_ref
+        county_ref: array of all FIPS codes, in the same order as comm_ref
+    Output: array of the same length as comms, with community indices
+            replaced by their respective FIPS codes"""
+
+    max_comm = np.max(comm_ref) + 1
+    ordered_counties = np.zeros(max_comm, dtype=int)
+    for idx, comm in np.ndenumerate(comm_ref):
+        if comm != -1:
+            ordered_counties[comm] = county_ref[idx]
+    return ordered_counties[comms]
+
+def fast_get_counties(data_dir: str):
+    """Get home counties for each particle.
+    Outputs: np.ndarray[int] (list of counties for each particle),
+             np.ndarray[int] (list of sorted, unique counties)"""
+
+    if data_dir[-1] != "/":
+        data_dir = data_dir + "/"
+
+    # Day 1 data is used since it is smaller and takes less load time than day 0
+    with h5py.File(data_dir + "plt00001/agents/agents.h5", 'r') as agent_file, \
+         h5py.File(data_dir + "plt00001.h5", 'r') as comm_file:
+
+        # Load all particle data
+        # int_data = agent_file['level_0']['data:datatype=0'][()] \
+        #     .reshape(-1, agent_file.attrs['num_component_int'][0] + 2)
+        real_data = agent_file['level_0']['data:datatype=1'][()] \
+            .reshape(-1, agent_file.attrs['num_component_real'][0] + 2)
+
+        # Determine indices of components
+        comm_indices = {}
+        for i in range(comm_file.attrs['num_components'][0]):
+            comm_indices[comm_file.attrs['component_' + str(i)]] = str(i)
+
+        counties = np.rint(comm_file['level_0']['data:datatype=' + comm_indices[b'FIPS']][()]).astype(int)
+        sorted_counties = np.unique(counties)
+        # if garbage (nonexistent) communities exist, they will have a county of -1
+        sorted_counties = sorted_counties[1:] if sorted_counties[0] == -1 else sorted_counties
+
+        # The 0th and 1st components of the real data are always the x- and y-positions
+        return comms_to_counties(get_comms_from_pos(real_data[:, 0], real_data[:, 1]),
+                                 np.rint(comm_file['level_0']['data:datatype=' + comm_indices[b'comm']][()]).astype(int),
+                                 counties), \
+               sorted_counties
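+
+# Example usage (the directory name is illustrative):
+#   particle_counties, sorted_counties = fast_get_counties("sim/180_run")
+# particle_counties[k] is then the home county FIPS code of particle k.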
+
+def get_home_and_work_counties(data_dir: str):
+    """Get both home and work counties for each particle.
+    About 2x slower than fast_get_counties().
+    Outputs: np.ndarray[int]: (2, N)-array of both counties for each particle,
+             np.ndarray[int]: list of sorted, unique counties"""
+
+    if data_dir[-1] != "/":
+        data_dir = data_dir + "/"
+    with h5py.File(data_dir + "plt00000/agents/agents.h5", 'r') as agent_file, \
+         h5py.File(data_dir + "plt00000.h5", 'r') as comm_file:
+
+        # Load all int particle data
+        int_data = agent_file['level_0']['data:datatype=0'][()] \
+            .reshape(-1, agent_file.attrs['num_component_int'][0] + 2)
+        # real_data = agent_file['level_0']['data:datatype=1'][()] \
+        #     .reshape(-1, agent_file.attrs['num_component_real'][0] + 2)
+
+        # Determine indices of components
+        comm_indices = {}
+        for i in range(comm_file.attrs['num_components'][0]):
+            comm_indices[comm_file.attrs['component_' + str(i)]] = str(i)
+
+        int_indices = {}
+        for i in range(2, agent_file.attrs['num_component_int'][0] + 2):
+            int_indices[agent_file.attrs['int_component_' + str(i)]] = i
+
+        counties = np.rint(comm_file['level_0']['data:datatype=' + comm_indices[b'FIPS']][()]).astype(int)
+        sorted_counties = np.unique(counties)
+        sorted_counties = sorted_counties[1:] if sorted_counties[0] == -1 else sorted_counties
+
+        return comms_to_counties(np.vstack((get_comms_from_coord(int_data[:, int_indices[b'home_i']],
+                                                                 int_data[:, int_indices[b'home_j']]),
+                                            get_comms_from_coord(int_data[:, int_indices[b'work_i']],
+                                                                 int_data[:, int_indices[b'work_j']]))),
+                                 np.rint(comm_file['level_0']['data:datatype=' + comm_indices[b'comm']][()]).astype(int),
+                                 counties), sorted_counties
+
+def get_age_county_counts(data_dir: str = "sim/180_run", days: int = 100):
+    """An example function that aggregates infection counts across counties and age groups.
+    Can be customized by changing which field is being matched, or by having
+    fast_get_counties return census tracts instead of FIPS codes, etc.
+    Inputs: data_dir: data directory
+            days: number of days to aggregate
+    Output: (days, 5, # of counties)-array with summed counts.
+    """
+    data_dir = data_dir[:-1] if data_dir[-1] == "/" else data_dir
+
+    # Get counties of each particle
+    particle_counties, sorted_counties = fast_get_counties(data_dir)
+
+    # Set up output array: day x age group x county
+    out = np.zeros((days, 5, len(sorted_counties)), dtype=np.float32)
+
+    for day in range(days):
+        with h5py.File(f"{data_dir}/plt{day:05}/agents/agents.h5", 'r') as agent_file:
+            # print(f"Starting Day {day}")
+            int_indices = {}
+            for i in range(2, agent_file.attrs['num_component_int'][0] + 2):
+                int_indices[agent_file.attrs['int_component_' + str(i)]] = i
+
+            int_data = agent_file['level_0']['data:datatype=0'][()] \
+                .reshape(-1, agent_file.attrs['num_component_int'][0] + 2)
+
+            if day == 0:
+                # Generate masks for each age group. Age group is constant, so the
+                # masks are computed once (this assumes the particle ordering is
+                # the same in every plotfile).
+                ages = int_data[:, int_indices[b'age_group']]
+                agerange = np.arange(5).reshape(5, 1)
+                age_masks = ages == agerange
+
+            # print("Aggregating\n")
+            for age in range(5):
+                # Get an array of counties that match status == 1 and the corresponding age group
+                infected_counties = particle_counties[(int_data[:, int_indices[b'status']] == 1) & age_masks[age]]
+
+                # Count up the number of occurrences of each county
+                counts = pd.Series(infected_counties).value_counts()
+
+                # Find the index of each counted county in the sorted_counties order.
+                # This makes sure that if any counties are missing, all counties
+                # are still slotted correctly.
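+
+# write_all_seeds (below) aggregates over many seeds in parallel and is meant to
+# be launched under MPI; an illustrative invocation (the script name is ours) is:
+#   mpiexec -n 8 python run_aggregation.py
+# where run_aggregation.py calls write_all_seeds(...). Note that h5py must be
+# built with parallel (mpio) support for the driver="mpio" file used below.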
+                present_counties = np.searchsorted(sorted_counties, counts.index.values)
+
+                # Set the corresponding slice of the output array to the counts
+                out[day, age, present_counties] = counts.values
+    return out
+
+def write_all_seeds(days: int, num_age_groups: int = 5, num_counties: int = 9):
+    """Write each seed's age/county counts to an HDF5 file, using MPI
+    to parallelize across seeds."""
+
+    comm = MPI.COMM_WORLD
+    mpi_size = comm.Get_size()
+    rank = comm.Get_rank()
+
+    # We assume every seed's data is in a folder named seed[xxx].
+    seeds = glob.glob("seed*")
+    seed_size = len(seeds)
+    block_size = np.ceil(seed_size / mpi_size).astype(int)
+
+    with h5py.File("county_data.h5", "w", libver="latest", driver="mpio", comm=comm) as f:
+        # num_age_groups and num_counties must match the shape of the arrays
+        # produced by get_age_county_counts for these runs.
+        ds = f.create_dataset("data", (seed_size, days, num_age_groups, num_counties))
+
+        for i in range(block_size):
+            seed_idx = rank * block_size + i
+            if seed_idx >= seed_size:
+                break
+            # print(f"Rank {rank} / {mpi_size}: doing {seeds[seed_idx]}, {i}/{block_size}", flush=True)
+            ds[seed_idx] = get_age_county_counts(seeds[seed_idx], days)
+            # print(f"Rank {rank} / {mpi_size}: done", flush=True)

From 672f51439611d9f588b0dcc3692ea1e194981d02 Mon Sep 17 00:00:00 2001
From: Arnav Singhal
Date: Wed, 7 Aug 2024 16:06:19 -0700
Subject: [PATCH 6/6] add info on hdf5_process to docs

---
 docs/source/usage/utilities.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/usage/utilities.rst b/docs/source/usage/utilities.rst
index 982448b..c2b67f1 100644
--- a/docs/source/usage/utilities.rst
+++ b/docs/source/usage/utilities.rst
@@ -178,3 +178,11 @@ either native AMReX output or HDF5 output, plotting infections (or an arbitrary
 function of community-level data) spatially across a given map (as a .shx file).
 Some of these shapefiles (for the USA, California, and the Bay Area) are located
 in the ``data/`` directory. See the script file and its comments for further details.
+
+Data Processing
+---------------
+To aggregate data written to HDF5 output, whether by county or census tract,
+we have some functions designed to aid processing in ``utilities/hdf5_process.py``.
+These functions allow grouping of data and counting various conditions on individuals,
+such as counting the total number of infections in each census tract and age group
+over all days of a simulation. See the script file and its comments for further details.
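+
+As a sketch of typical usage (the directory layout is illustrative, and the script
+is assumed to be importable, e.g. when run from ``utilities/``):
+
+.. code-block:: python
+
+    from hdf5_process import fast_get_counties, get_age_county_counts
+
+    # Home FIPS code per particle, plus the sorted list of unique counties
+    particle_counties, sorted_counties = fast_get_counties("sim/180_run")
+
+    # (days, age groups, counties) array of daily infection counts
+    counts = get_age_county_counts("sim/180_run", days=100)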