diff --git a/SNR/hdf5_rtplot_utils.py b/SNR/hdf5_rtplot_utils.py
index e074f7f..e872440 100644
--- a/SNR/hdf5_rtplot_utils.py
+++ b/SNR/hdf5_rtplot_utils.py
@@ -37,7 +37,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import os
-import deepdish as dd
+import h5py
 from pydarnio import BorealisRead
 from multiprocessing import Process
@@ -120,7 +120,7 @@ def plot_unaveraged_range_time_data(data_array, num_sequences_array, timestamps_
     new_power_array = np.transpose(power_array)
 
     kw = {'width_ratios': [95, 5], 'height_ratios': [1, 3]}
-    fig, ((ax1, cax1), (ax2, cax2)) = plt.subplots(2, 2, figsize=figsize, gridspec_kw=kw)
+    fig, ((ax1, cax1), (ax2, cax2)) = plt.subplots(2, 2, figsize=figsize, gridspec_kw=kw, sharex='col')
     fig.suptitle(f'{dataset_descriptor} Raw Power Sequence Time {start_time.strftime("%Y%m%d")} '
                  f'{start_time.strftime("%H:%M:%S")} to {end_time.strftime("%H:%M:%S")} UT vs Range')
@@ -138,7 +138,6 @@
     fig.colorbar(img, cax=cax2, label='Raw Power (dB)')
     cax1.axis('off')
-    ax2.sharex(ax1)
     print(plot_filename)
     plt.savefig(plot_filename)
     plt.close()
@@ -202,7 +201,7 @@ def plot_antennas_range_time(antennas_iq_file, antenna_nums=None, num_processes=
     if is_site_file:
         arrays, antenna_names, antenna_indices = antennas_iq_site_to_array(antennas_iq_file, antenna_nums)
     else:
-        reader = BorealisRead(antennas_iq_file, 'antennas_iq', 'array')
+        reader = BorealisRead(antennas_iq_file, 'antennas_iq')
         arrays = reader.arrays
 
     (num_records, num_antennas, max_num_sequences, num_samps) = arrays['data'].shape
@@ -458,7 +457,7 @@ def plot_averaged_range_time_data(data_array, timestamps_array, dataset_descript
     end_time = datetime.datetime.utcfromtimestamp(timestamps[-1])
 
     kw = {'width_ratios': [95, 5], 'height_ratios': [1, 3]}
-    fig, ((ax1, cax1), (ax2, cax2)) = plt.subplots(2, 2, figsize=figsize, gridspec_kw=kw)
+    fig, ((ax1, cax1), (ax2, cax2)) = plt.subplots(2, 2, figsize=figsize, gridspec_kw=kw, sharex='col')
     fig.suptitle(f'{dataset_descriptor} PWR Time {start_time.strftime("%Y%m%d")} {start_time.strftime("%H:%M:%S")} to '
                  f'{end_time.strftime("%H:%M:%S")} UT vs Range')
@@ -475,7 +474,6 @@
     fig.colorbar(img, cax=cax2, label='Raw Power (dB)')
     cax1.axis('off')
-    ax2.get_shared_x_axes().join(ax1, ax2)
     print(plot_filename)
     plt.savefig(plot_filename)
     plt.close()
@@ -670,17 +668,17 @@ def plot_rawrf_data(rawrf_file, antenna_nums=None, num_processes=3, sequence_num
     time_of_plot = '.'.join(basename.split('.')[0:6])
 
-    records = dd.io.load(rawrf_file)
-    record_keys = sorted(list(records.keys()))
+    f = h5py.File(rawrf_file, 'r')
+    record_keys = sorted(list(f.keys()))
 
-    record = records[record_keys[0]]
+    record = f[record_keys[0]]
 
     # This little hack was made to deal with rawrf files afflicted by Issue #258 on the Borealis GitHub, which
     # has since been solved. It should work for all rawrf files regardless.
-    num_sequences, num_antennas, num_samps = record['data_dimensions']
+    num_sequences, num_antennas, num_samps = record.attrs['data_dimensions']
     total_samples = record['data'].size
     sequences_stored = int(total_samples / num_samps / num_antennas)
-    data = record['data'].reshape((sequences_stored, num_antennas, num_samps))
+    data = record['data'][()].reshape((sequences_stored, num_antennas, num_samps))
 
     # typically, antenna names and antenna indices are the same except
     # where certain antennas were skipped in data writing for any reason.
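A note on the recurring `sharex` change in this file: `get_shared_x_axes().join(ax1, ax2)` was deprecated in Matplotlib 3.6 and removed in 3.8, and the separate `ax2.sharex(ax1)` call did the same job through a newer API; passing `sharex='col'` when the figure is created replaces both. A standalone sketch of the resulting layout, with synthetic data and a placeholder output path rather than the repo's power arrays:

```python
import matplotlib
matplotlib.use('Agg')  # render off-screen; these scripts only save to file anyway
import matplotlib.pyplot as plt
import numpy as np

# Same layout as plot_unaveraged_range_time_data: a narrow colorbar column,
# two stacked panels, and x-axes shared per column at creation time.
kw = {'width_ratios': [95, 5], 'height_ratios': [1, 3]}
fig, ((ax1, cax1), (ax2, cax2)) = plt.subplots(2, 2, figsize=(12, 8),
                                               gridspec_kw=kw, sharex='col')

x = np.arange(500)
ax1.plot(x, np.random.randn(500).cumsum())
img = ax2.imshow(np.random.rand(75, 500), aspect='auto', origin='lower')
fig.colorbar(img, cax=cax2, label='Raw Power (dB)')
cax1.axis('off')

fig.savefig('/tmp/sharex_demo.png')  # placeholder path
plt.close(fig)
```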
@@ -768,7 +766,7 @@ def plot_iq_data(voltage_samples, timestamps_array, dataset_descriptor, plot_fil
     start_time = datetime.datetime.utcfromtimestamp(timestamps_array[0])
     end_time = datetime.datetime.utcfromtimestamp(timestamps_array[-1])
 
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize, sharex='col')
     fig.suptitle(f'{dataset_descriptor} Raw Voltage Sequence Time {start_time.strftime("%Y%m%d")} '
                  f'{start_time.strftime("%H:%M:%S")} to {end_time.strftime("%H:%M:%S")} UT')
@@ -783,7 +781,6 @@ def plot_iq_data(voltage_samples, timestamps_array, dataset_descriptor, plot_fil
     ax2.plot(np.arange(0, len(voltage_samples[0, :]))/sample_rate*1e6, 10 * np.log10(np.abs(voltage_samples[0, :])))
     ax2.set_ylabel('Power (dB)')
     ax2.set_xlabel('Time (us)')
-    ax2.get_shared_x_axes().join(ax1, ax2)
 
     plot_name = plot_filename_prefix + '_time.jpg'
     print(plot_name)
diff --git a/SNR/minimal_restructuring.py b/SNR/minimal_restructuring.py
index e8cb14a..a92e76e 100644
--- a/SNR/minimal_restructuring.py
+++ b/SNR/minimal_restructuring.py
@@ -7,10 +7,8 @@
 Functions
 ---------
 """
-import copy
+import h5py
 import numpy as np
-import os
-import deepdish as dd
 
 
 def antennas_iq_site_to_array(antennas_iq_file, antenna_nums=None):
@@ -37,48 +35,48 @@
     # Dictionary of fields that will be returned
     arrays = {}
 
-    group = dd.io.load(antennas_iq_file)
-    timestamps = sorted(list(group.keys()))
+    with h5py.File(antennas_iq_file, 'r') as f:
+        timestamps = sorted(list(f.keys()))
 
-    antenna_arrays_order = group[timestamps[0]]['antenna_arrays_order']
-    arrays['antenna_arrays_order'] = antenna_arrays_order
+        antenna_arrays_order = f[timestamps[0]]['antenna_arrays_order'][:]
+        arrays['antenna_arrays_order'] = antenna_arrays_order
 
-    num_records = len(timestamps)
-    num_sequences_array = np.zeros(num_records, dtype=np.uint32)
+        num_records = len(timestamps)
+        num_sequences_array = np.zeros(num_records, dtype=np.uint32)
 
-    # typically, antenna names and antenna indices are the same except
-    # where certain antennas were skipped in data writing for any reason.
-    if antenna_nums is None or len(antenna_nums) == 0:
-        antenna_indices = list(range(0, antenna_arrays_order.size))
-        antenna_names = list(antenna_arrays_order)
-    else:
-        antenna_indices = [list(antenna_arrays_order).index('antenna_' + str(i)) for i in antenna_nums]
-        antenna_names = [f'antenna_{a}' for a in antenna_nums]
+        # typically, antenna names and antenna indices are the same except
+        # where certain antennas were skipped in data writing for any reason.
+        if antenna_nums is None or len(antenna_nums) == 0:
+            antenna_indices = list(range(0, antenna_arrays_order.size))
+            antenna_names = list(antenna_arrays_order)
+        else:
+            antenna_indices = [list(antenna_arrays_order).index(bytes(f'antenna_{i}', 'utf-8')) for i in antenna_nums]
+            antenna_names = [f'antenna_{a}' for a in antenna_nums]
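A note on the `bytes(f'antenna_{i}', 'utf-8')` lookup added above: deepdish handed back Python `str` entries for `antenna_arrays_order`, while h5py typically returns string datasets as fixed-length byte strings (numpy `'S'` dtype), so a plain-`str` key would never match. A small self-contained illustration with made-up antenna names:

```python
import numpy as np

# h5py commonly reads string datasets back as numpy byte strings ('S' dtype),
# so a str key must be encoded before searching, as the diff does ...
antenna_arrays_order = np.array([b'antenna_0', b'antenna_1', b'antenna_7'])

idx = list(antenna_arrays_order).index(bytes(f'antenna_{7}', 'utf-8'))
assert idx == 2

# ... or the dataset can be decoded once up front instead.
decoded = [name.decode('utf-8') for name in antenna_arrays_order]
assert decoded.index('antenna_7') == 2
```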
 
-    # Get the maximum number of sequences for a record in the file
-    max_sequences = 0
-    for i, timestamp in enumerate(timestamps):
-        record = group[timestamp]
-        _, num_seqs, _ = record['data_dimensions']
-        num_sequences_array[i] = num_seqs
-        if num_seqs > max_sequences:
-            max_sequences = num_seqs
+        # Get the maximum number of sequences for a record in the file
+        max_sequences = 0
+        for i, timestamp in enumerate(timestamps):
+            record = f[timestamp]
+            _, num_seqs, _ = record['data_dimensions'][:]
+            num_sequences_array[i] = num_seqs
+            if num_seqs > max_sequences:
+                max_sequences = num_seqs
 
-    # We will need these dimensions for pre-allocating arrays
-    data_dimensions = group[timestamps[0]]['data_dimensions']
+        # We will need these dimensions for pre-allocating arrays
+        data_dimensions = f[timestamps[0]]['data_dimensions'][:]
 
-    # Create arrays for each of the needed fields
-    timestamps_data = np.zeros((num_records, max_sequences), dtype=np.float64)
-    data_array = np.zeros((num_records, len(antenna_indices), max_sequences, data_dimensions[2]), dtype=np.complex64)
+        # Create arrays for each of the needed fields
+        timestamps_data = np.zeros((num_records, max_sequences), dtype=np.float64)
+        data_array = np.zeros((num_records, len(antenna_indices), max_sequences, data_dimensions[2]), dtype=np.complex64)
 
-    # Copy the relevant data into the arrays
-    for i, timestamp in enumerate(timestamps):
-        record = group[timestamp]
-        num_antennas, num_seqs, num_samps = record['data_dimensions']
-        timestamps_data[i, :num_seqs] = record['sqn_timestamps']
-        data = record['data'].reshape(record['data_dimensions'])
-        data_array[i, :, :num_seqs, :] = \
-            data[antenna_indices, ...]
+        # Copy the relevant data into the arrays
+        for i, timestamp in enumerate(timestamps):
+            record = f[timestamp]
+            num_antennas, num_seqs, num_samps = record['data_dimensions'][:]
+            timestamps_data[i, :num_seqs] = record['sqn_timestamps'][:]
+            data = record['data'][:].reshape((num_antennas, num_seqs, num_samps))
+            data_array[i, :, :num_seqs, :] = \
+                data[antenna_indices, ...]
 
     # Copy to dictionary and return
     arrays['data'] = data_array
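The core of the restructuring above is a pre-allocate-and-pad step: records can hold different numbers of sequences, so every record is copied into an array sized for the longest one. A self-contained sketch of that pattern with toy shapes (not Borealis data):

```python
import numpy as np

# Two fake records of shape (num_antennas, num_sequences, num_samps),
# with differing sequence counts, as in site-structured antennas_iq files.
records = [np.ones((2, 4, 5), dtype=np.complex64),
           np.ones((2, 7, 5), dtype=np.complex64)]
max_sequences = max(rec.shape[1] for rec in records)

# Allocate to the maximum; shorter records leave a zero-padded tail.
data_array = np.zeros((len(records), 2, max_sequences, 5), dtype=np.complex64)
for i, rec in enumerate(records):
    data_array[i, :, :rec.shape[1], :] = rec

assert data_array.shape == (2, 2, 7, 5)
assert np.all(data_array[0, :, 4:, :] == 0)  # padding stays zero
```

Only the per-record sequence counts (the `num_sequences_array` built above) distinguish real data from padding, so downstream averages should use those counts rather than `max_sequences`.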
diff --git a/borealis_gaps.py b/borealis_gaps.py
index d1732c4..af12ea9 100755
--- a/borealis_gaps.py
+++ b/borealis_gaps.py
@@ -15,142 +15,40 @@
 """
 import argparse
-import bz2
 import datetime
-import deepdish
 import glob
-import math
-import numpy as np
+from multiprocessing import Manager, Process
 import os
-import subprocess
-import sys
-import time
-
-from multiprocessing import Pool, Queue, Manager, Process
-
-try:
-    from pydarnio import BorealisRead
-except ModuleNotFoundError as err:
-    raise ModuleNotFoundError('Please source a pydarnio virtual environment!') from err
-
-
-def usage_msg():
-    """
-    Return the usage message for this process.
-
-    This is used if a -h flag or invalid arguments are provided.
-
-    Returns
-    -------
-    usage_message: str
-        The usage message on how to use this
-    """
-
-    usage_message = """ borealis_gaps.py [-h] data_dir start_day end_day
-
-    Pass in the raw data directory that you want to check for borealis gaps. This script uses
-    multiprocessing to check for gaps in the hdf5 files of each day and gaps between the days.
-
-    This script will use the find command to find files from the specified days in the given
-    data directory.
-    """
-
-    return usage_message
-
-
-def borealis_gaps_parser():
-    parser = argparse.ArgumentParser(usage=usage_msg())
-    parser.add_argument("data_dir", help="Path to the directory that holds any directory structure which within "
-                                         "contains all *[filetype].hdf5 or .hdf5.site.bz2 files from the dates "
-                                         "you wish to get downtimes. Array files (.hdf5) should exist if array is "
-                                         "the specified file structure, otherwise .site should exist.")
-    parser.add_argument("start_day", help="First day to check, given as YYYYMMDD.")
-    parser.add_argument("end_day", help="Last day to check, given as YYYYMMDD.")
-    parser.add_argument("--filetype", help="The filetype that you want to check gaps in (bfiq or "
-                                           "rawacf typical). Default 'rawacf'")
-    parser.add_argument("--gap_spacing", help="The gap spacing that you wish to check the file"
-                                              " for, in seconds. Default 120s.")
-    parser.add_argument("--num_processes", help="The number of processes to use in the multiprocessing,"
-                                                " default 4.")
-    parser.add_argument("--file_structure", help="The file structure to use when reading the files, "
-                                                 " default 'array' but can also be 'site'")
-    parser.add_argument("--gaps_table_file", help="The pathname of the file to print the gaps table "
-                                                  "to, default is placed in $HOME/borealis_gaps/")
-    return parser
-
-
-def decompress_bz2(filename):
-    """
-    Decompresses a bz2 file and returns the decompressed filename.
-
-    Parameters
-    ----------
-    filename
-        bz2 filename to decompress
-
-    Returns
-    -------
-    newfilepath
-        filename of decompressed file now in the path
-    """
-    basename = os.path.basename(filename)
-    newfilepath = os.path.dirname(filename) + '/' + '.'.join(basename.split('.')[0:-1]) # all but bz2
-    with open(newfilepath, 'wb') as new_file, bz2.BZ2File(filename, 'rb') as file:
-        for data in iter(lambda : file.read(100 * 1024), b''):
-            new_file.write(data)
-
-    return newfilepath
+import h5py
+import numpy as np
 
 
-def get_record_timestamps(filename, record_dict, filetype, file_structure='array'):
+def get_record_timestamps(filename, record_dict):
     """
     Get the record timestamps from a file. These are what are used to determine the gaps.
 
-    Also decompresses if the file is a bzip2 file before the read.
-
     Parameters
     ----------
-    filename
+    filename: str
         Filename to retrieve timestamps from.
-    record_dict
+    record_dict: dict
        record_dictionary to append the entry filename: timestamps list
-    file_structure
-        File structure of file provided. Default array, but can also
-        be site structured. Determines how to retrieve the timestamps
-        of the records.
     """
    print('Getting timestamps from file : ' + filename)
-    if file_structure == 'site':
-        if os.path.basename(filename).split('.')[-1] in ['bz2', 'bzip2']:
-            borealis_hdf5_file = decompress_bz2(filename)
-            bzip2 = True
+    with h5py.File(filename, 'r') as f:
+        recs = sorted(list(f.keys()))
+        if "sqn_timestamps" in recs:
+            sqn_timestamps = f["sqn_timestamps"][:, 0]
         else:
-            borealis_hdf5_file = filename
-            bzip2 = False
-
-        reader = BorealisRead(filename, filetype, file_structure)
-        # get all records first timestamp
-        records = []
-        for record_name in reader.record_names:
-            records.append(reader.records[record_name]['sqn_timestamps'][0])
-
-        if bzip2:
-            if borealis_hdf5_file != filename:
-                # if the original was bzipped the borealis_hdf5_file used will have a different name from original.
-                os.remove(borealis_hdf5_file)
-            else:
-                print('Warning: attempted remove of original file {}'.format(borealis_hdf5_file))
-    elif file_structure == 'array':
-        # get first timestamp per record in sqn_timestamps array of num_records x num_sequences
-        reader = BorealisRead(filename, filetype, file_structure)
-        records = reader.arrays['sqn_timestamps'][:,0] # all records first timestamp
-    else:
-        raise Exception('Invalid file structure provided')
+            sqn_timestamps = []
+            for r in recs:
+                rec = f[r]
+                sqn_timestamps.append(rec["sqn_timestamps"][0])
+            sqn_timestamps = np.array(sqn_timestamps)
 
-    inner_dict1 = record_dict
-    inner_dict1[filename] = records
-    record_dict = inner_dict1 # reassign
+    record_dict[filename] = sqn_timestamps
 
 
 def combine_timestamp_lists(record_dict):
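Two remarks on the rewritten `get_record_timestamps`. First, the original `+` line read `np.concatenate(sqn_timestamps)`, but each appended element is a scalar (`rec["sqn_timestamps"][0]`), and `np.concatenate` rejects zero-dimensional inputs; `np.array` is used above instead. Second, the removed `inner_dict1` shuffle was never needed: a `multiprocessing.Manager().dict()` proxy forwards key assignment to the manager process, so `record_dict[filename] = ...` is visible to the parent directly. A runnable sketch of that behaviour (stub data, not a Borealis reader):

```python
from multiprocessing import Manager, Process


def worker(filename, record_dict):
    # Key assignment on the DictProxy is sent to the manager process;
    # no copy-and-reassign dance is required.
    record_dict[filename] = [1.0, 2.0, 3.0]


if __name__ == '__main__':
    with Manager() as manager:
        shared = manager.dict()
        p = Process(target=worker, args=('example.hdf5', shared))
        p.start()
        p.join()
        print(dict(shared))  # {'example.hdf5': [1.0, 2.0, 3.0]}
```

The caveat is that in-place mutation of a stored value (e.g. appending to a list held in the proxy) does not propagate; only assignment through the proxy does.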
@@ -159,7 +57,7 @@
     Parameters
     ----------
-    record_dict
+    record_dict: dict
         Dictionary of filename: list of timestamps within file. Typically
         contains all files from within a single day.
 
@@ -247,7 +145,6 @@
     first_record = sorted_timestamps[0]
     last_record = sorted_timestamps[-1]
     start_time = datetime.datetime.utcfromtimestamp(float(first_record))
-    end_time = datetime.datetime.utcfromtimestamp(float(last_record))
     if start_time > previous_end_time + datetime.timedelta(seconds=float(gap_spacing)):
         # append gap to this day's list of gaps. Dict key is day, list is list of (gap_start, gap_end)
         if day not in gaps_dict.keys():
@@ -306,9 +203,7 @@
             gap_duration = gap_end_time - gap_start_time
             duration = gap_duration.total_seconds()
             duration_min = round(duration/60.0, 1)
-            print(gap_start_time.strftime(strf_format) + ' ,' +
-                  gap_end_time.strftime(strf_format) + ' ,' +
-                  str(duration_min) + ',,', file=f)
+            print(f"{gap_start_time.strftime(strf_format)} ,{gap_end_time.strftime(strf_format)} ,{duration_min:.1f},,", file=f)
             duration_dict[day] += duration_min
 
         # end table, print new line
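For reference, the gap test that these hunks leave intact reduces to a single `datetime` comparison. A standalone check with made-up times:

```python
import datetime

gap_spacing = 120.0  # seconds, matching the script's default
previous_end_time = datetime.datetime(2023, 1, 1, 23, 59, 30)
start_time = datetime.datetime(2023, 1, 2, 0, 3, 0)

# A gap exists when the next record starts more than gap_spacing seconds
# after the previous record ended.
if start_time > previous_end_time + datetime.timedelta(seconds=gap_spacing):
    duration_min = round((start_time - previous_end_time).total_seconds() / 60.0, 1)
    print(f"gap of {duration_min} min")  # gap of 3.5 min
```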
@@ -321,39 +216,64 @@
     print('TOTAL DOWNTIME DURATION IN PERIOD from {} to {}: ,\n'.format(
         first_timestamp.strftime(strf_format), last_timestamp.strftime(strf_format)), file=f)
-    print('{} minutes,\n'.format(total_duration_min), file=f)
-    print('{} hours,\n'.format(total_duration_hrs), file=f)
-    print('{} days,\n'.format(total_duration_days), file=f)
+    print(f'{total_duration_min} minutes,\n', file=f)
+    print(f'{total_duration_hrs} hours,\n', file=f)
+    print(f'{total_duration_days} days,\n', file=f)
 
+    # Print the results to screen
     print(' ')
-    subprocess.call(['cat', print_filename])
+    with open(print_filename, 'r') as f:
+        lines = f.readlines()
+        for line in lines:
+            print(line.strip())
 
 
-if __name__ == '__main__':
-    parser = borealis_gaps_parser()
-    args = parser.parse_args()
+def usage_msg():
+    """
+    Return the usage message for this process.
 
-    if args.gap_spacing is None:
-        gap_spacing = 120 # s
-    else:
-        gap_spacing = float(args.gap_spacing)
+    This is used if a -h flag or invalid arguments are provided.
 
-    if args.filetype is None:
-        filetype = 'rawacf'
-    else:
-        filetype = args.filetype
+    Returns
+    -------
+    usage_message: str
+        The usage message on how to use this
+    """
 
-    if args.num_processes is None:
-        num_processes = 4
-    else:
-        num_processes = int(args.num_processes)
+    usage_message = """ borealis_gaps.py [-h] data_dir start_day end_day
 
-    if args.file_structure is None:
-        file_structure = 'array'
-        file_extension = '.hdf5'
-    else:
-        file_structure = args.file_structure
-        file_extension = '.hdf5.site.bz2'
+    Pass in the raw data directory that you want to check for borealis gaps. This script uses
+    multiprocessing to check for gaps in the hdf5 files of each day and gaps between the days.
+
+    This script will search the given data directory for files from the specified days.
+    """
+
+    return usage_message
+
+
+def borealis_gaps_parser():
+    parser = argparse.ArgumentParser(usage=usage_msg())
+    parser.add_argument("data_dir",
+                        help="Path to the directory that holds any directory structure which within contains all "
+                             "*[filetype].hdf5 or .hdf5.site files from the dates you wish to get downtimes.")
+    parser.add_argument("start_day", help="First day to check, given as YYYYMMDD.")
+    parser.add_argument("end_day", help="Last day to check, given as YYYYMMDD.")
+    parser.add_argument("--filetype", default="rawacf",
+                        help="The filetype that you want to check gaps in (bfiq or rawacf typical). Default 'rawacf'")
+    parser.add_argument("--gap_spacing", type=float, default=120.0,
+                        help="The gap spacing that you wish to check the file for, in seconds. Default 120s.")
+    parser.add_argument("--num_processes", type=int, default=4,
+                        help="The number of processes to use in the multiprocessing, default 4.")
+    parser.add_argument("--gaps_table_file",
+                        help="The pathname of the file to print the gaps table to, default is placed in "
+                             "$HOME/borealis_gaps/")
+    return parser
+
+
+if __name__ == '__main__':
+    parser = borealis_gaps_parser()
+    args = parser.parse_args()
 
     data_dir = args.data_dir
     if data_dir[-1] != '/':
@@ -362,14 +282,28 @@ def print_gaps(gaps_dict, first_timestamp, last_timestamp, gap_spacing, print_fi
     lowest_dir = data_dir.split('/')[-2] # now has / at the end so must be second last element
 
     if args.gaps_table_file is None:
-        print_filename = os.environ['HOME'] + '/borealis_gaps/' + args.start_day + '_' + args.end_day + '_' + lowest_dir + '_gaps.csv'
+        print_filename = f"{os.environ['HOME']}/borealis_gaps/{args.start_day}_{args.end_day}_{lowest_dir}_gaps.csv"
     else:
-        print_filename = gaps_table_file
+        print_filename = args.gaps_table_file
+
+    print_dir = os.path.dirname(print_filename)
+    if not os.path.isdir(print_dir):
+        raise OSError(f"Directory {print_dir} does not exist")
 
     files = []
-    start_day = datetime.datetime(year=int(args.start_day[0:4]), month=int(args.start_day[4:6]), day=int(args.start_day[6:8]), tzinfo=datetime.timezone.utc)
-    end_day = datetime.datetime(year=int(args.end_day[0:4]), month=int(args.end_day[4:6]), day=int(args.end_day[6:8]), tzinfo=datetime.timezone.utc)
+    start_day = datetime.datetime(
+        year=int(args.start_day[0:4]),
+        month=int(args.start_day[4:6]),
+        day=int(args.start_day[6:8]),
+        tzinfo=datetime.timezone.utc
+    )
+    end_day = datetime.datetime(
+        year=int(args.end_day[0:4]),
+        month=int(args.end_day[4:6]),
+        day=int(args.end_day[6:8]),
+        tzinfo=datetime.timezone.utc
+    )
 
     # this dictionary will be day: list of sorted timestamps in the day
     timestamps_dict = {}
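Side note on the argparse rework above: moving types and defaults into `add_argument` removes all of the `if args.x is None` boilerplate, and the manual `YYYYMMDD` slicing could equally use `strptime`. A quick standalone check of the equivalence:

```python
import argparse
import datetime

parser = argparse.ArgumentParser()
parser.add_argument("start_day", help="YYYYMMDD")
parser.add_argument("--gap_spacing", type=float, default=120.0)
args = parser.parse_args(["20230101"])

# strptime yields the same result as slicing out year/month/day by hand.
by_strptime = datetime.datetime.strptime(args.start_day, "%Y%m%d").replace(
    tzinfo=datetime.timezone.utc)
by_slicing = datetime.datetime(year=int(args.start_day[0:4]),
                               month=int(args.start_day[4:6]),
                               day=int(args.start_day[6:8]),
                               tzinfo=datetime.timezone.utc)
assert by_strptime == by_slicing
print(by_strptime.isoformat(), args.gap_spacing)  # 2023-01-01T00:00:00+00:00 120.0
```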
@@ -382,15 +316,12 @@ def print_gaps(gaps_dict, first_timestamp, last_timestamp, gap_spacing, print_fi
     for one_day in daterange(start_day, end_day):
         # Get all the filenames and then all the timestamps for this day.
-        print(one_day.strftime("%Y%m%d"))
+        year = one_day.strftime("%Y")
+        month = one_day.strftime("%m")
+        day = one_day.strftime("%d")
+        print(f"{year}{month}{day}")
 
-        files = subprocess.check_output(['find', data_dir, '-name',
-                                         one_day.strftime("%Y%m%d")+'*.' + filetype + '*' +
-                                         file_extension]).splitlines()
-        # if os.path.isdir(data_dir + one_day.strftime("%Y%m%d")):
-        #     files = glob.glob(data_dir + one_day.strftime("%Y%m%d") + '/*.' + filetype + '*')
-        # else:
-        #     continue
+        files = sorted(glob.glob(f"{data_dir}/{year}/{month}/{year}{month}{day}*{args.filetype}.hdf5*"))
 
         jobs = []
         files_left = True
@@ -399,27 +330,26 @@ def print_gaps(gaps_dict, first_timestamp, last_timestamp, gap_spacing, print_fi
         manager1 = Manager()
         filename_dict = manager1.dict()
 
+        files_this_day = False
         while files_left:
-            for procnum in range(num_processes):
+            for procnum in range(args.num_processes):
                 try:
-                    filename = files[filename_index + procnum].decode('ascii')
+                    filename = files[filename_index + procnum]
                 except IndexError:
                     if filename_index + procnum == 0:
-                        print('No files found for date {}'.format(
-                            one_day.strftime("%Y%m%d")))
+                        print(f"No files found for date {one_day.strftime('%Y%m%d')}")
                         files_this_day = False
                     files_left = False
                     break
                 files_this_day = True
-                p = Process(target=get_record_timestamps, args=(filename, filename_dict, filetype, file_structure))
-                #p = Process(target=check_for_gaps_in_file, args=(filename, gap_spacing, gaps_dict, file_duration_dict))
+                p = Process(target=get_record_timestamps, args=(filename, filename_dict))
                jobs.append(p)
                 p.start()
 
             for proc in jobs:
                 proc.join()
-            filename_index += num_processes
+            filename_index += args.num_processes
 
         if files_this_day:
             record_dict[one_day] = filename_dict
@@ -428,27 +358,26 @@ def print_gaps(gaps_dict, first_timestamp, last_timestamp, gap_spacing, print_fi
         if one_day == start_day:
             first_timestamp = start_day.timestamp()
             if one_day in timestamps_dict.keys():
-                timestamps_dict[one_day].insert(0,first_timestamp)
+                timestamps_dict[one_day].insert(0, first_timestamp)
             else:
                 timestamps_dict[one_day] = [first_timestamp]
-        elif one_day == end_day:
+        if one_day == end_day:
             last_timestamp = (end_day+datetime.timedelta(seconds=59, minutes=59, hours=23)).timestamp()
             if one_day in timestamps_dict.keys():
                 timestamps_dict[one_day].append(last_timestamp)
             else:
                 timestamps_dict[one_day] = [last_timestamp]
 
-        if one_day in timestamps_dict.keys(): # first or last day, or files were found for the day
-            gaps_dict[one_day] = check_for_gaps_between_records(timestamps_dict[one_day], gap_spacing)
-            #print(gaps_dict[one_day])
+        if one_day in timestamps_dict.keys():  # first or last day, or files were found for the day
+            gaps_dict[one_day] = check_for_gaps_between_records(timestamps_dict[one_day], args.gap_spacing)
 
     # now that gaps_dict is entirely filled with each day in the range, find gaps between days
-    gaps_dict = check_for_gaps_between_days(timestamps_dict, gap_spacing, gaps_dict)
+    gaps_dict = check_for_gaps_between_days(timestamps_dict, args.gap_spacing, gaps_dict)
 
     sorted_days = sorted(timestamps_dict.keys())
     # first timestamp is first day's first timestamp
     first_timestamp = datetime.datetime.utcfromtimestamp(float(sorted(timestamps_dict[sorted_days[0]])[0]))
     # last timestamp is last day's last timestamp
     last_timestamp = datetime.datetime.utcfromtimestamp(float(sorted(timestamps_dict[sorted_days[-1]])[-1]))
 
-    print_gaps(gaps_dict, first_timestamp, last_timestamp, gap_spacing, print_filename)
+    print_gaps(gaps_dict, first_timestamp, last_timestamp, args.gap_spacing, print_filename)
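Finally, the dispatch loop kept in `__main__` above batches files into groups of `num_processes`, joining each batch before starting the next. Stripped of the HDF5 reading, the pattern looks like this (stub worker and placeholder filenames):

```python
from multiprocessing import Manager, Process


def get_timestamps_stub(filename, record_dict):
    record_dict[filename] = []  # stand-in for reading sqn_timestamps


if __name__ == '__main__':
    files = [f"file_{i}.hdf5" for i in range(10)]  # placeholder names
    num_processes = 4

    with Manager() as manager:
        filename_dict = manager.dict()
        # Launch at most num_processes workers, join them all,
        # then take the next slice of the file list.
        for batch_start in range(0, len(files), num_processes):
            jobs = [Process(target=get_timestamps_stub, args=(name, filename_dict))
                    for name in files[batch_start:batch_start + num_processes]]
            for p in jobs:
                p.start()
            for p in jobs:
                p.join()
        assert len(filename_dict) == len(files)
```

A `multiprocessing.Pool` would do the same with less bookkeeping, but the batch-and-join structure above mirrors the script as written.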