NOAA-GFDL · Ciheim · Aug 22, 2024 · Aug 17, 2024 · Aug 17, 2024 · Aug 17, 2024
diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py
@@ -74,7 +74,6 @@ def getInfoFromFilename(filename,dictInfo,logger):
         ncfilename = filename.split(".")[0].split("_")
         varname = ncfilename[0]
         dictInfo["variable"] = varname
-        miptable = ncfilename[1]
         dictInfo["mip_table"] = miptable
         modelname = ncfilename[2]
         dictInfo["model"] = modelname
@@ -250,7 +249,11 @@ def getStandardName(list_variable_id,list_realm):
   dictCF = {}
   try:
       url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv"
-      df = pd.read_csv(url, sep=",", header=0,index_col=False)
+      url2 = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip6_vars.csv"
+      df1 = pd.read_csv(url, sep=",", header=0,index_col=False)
+      df2 = pd.read_csv(url2, sep=",", header=0,index_col=False)
+      #TODO Add try catch except for concat operation if concat fails for some reason 
+      df = pd.concat([df1,df2]).drop_duplicates().reset_index(drop=True)
   except IOError:
             print("Unable to open file")
             sys.exit(1)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -68,6 +68,11 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
                if not filename.endswith(".nc"):
                    logger.debug("FILE does not end with .nc. Skipping", filepath)
                    continue
+               #if our filename expectations are not met compared to the output_file_path_template in config, skip the loop. TODO revisit for statics
+               if ("static" not in filename):
+                 if ((len(filename.split('.'))-1) != len(set_ftemplate)):
+                   print("Skipping ",filename)
+                   continue 
                logger.info(dirpath+"/"+filename)
                dictInfo = {}
                dictInfo = getinfo.getProject(projectdir, dictInfo)

diff --git a/configs/config-CM4.5v01_om5b06_piC_noBLING.yaml b/configs/config-CM4.5v01_om5b06_piC_noBLING.yaml
@@ -0,0 +1,41 @@
+#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+#catalog headers
+#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjuction
+#with the ESM collection specification standards and the appropriate workflows.
+
+headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+                  "frequency", "realm", "table_id",
+                  "member_id", "grid_label", "variable_id",
+                  "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]
+
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
+
+output_file_template: ['realm','time_range','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
+
+#######################################################
+
+input_path:  "/archive/John.Krasting/fre/FMS2024.02_OM5_20240724/CM4.5v01_om5b06_piC_noBLING/gfdl.ncrc5-intel23-prod-openmp/pp/"
+output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)
diff --git a/configs/config-CM4_piControl_C.yaml b/configs/config-CM4_piControl_C.yaml
@@ -0,0 +1,42 @@
+#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+#catalog headers
+#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjuction
+#with the ESM collection specification standards and the appropriate workflows.
+
+headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+                  "frequency", "realm", "table_id",
+                  "member_id", "grid_label", "variable_id",
+                  "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]
+
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+#"/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp
+output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
+
+output_file_template: ['realm','time_range','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
+
+#######################################################
+
+input_path: "/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp" #"/uda/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/"
+output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4_piControl_C"
diff --git a/configs/config-ESM4.5v01_om5b04_piC.yaml b/configs/config-ESM4.5v01_om5b04_piC.yaml
@@ -0,0 +1,41 @@
+#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+#catalog headers
+#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjuction
+#with the ESM collection specification standards and the appropriate workflows.
+
+headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+                  "frequency", "realm", "table_id",
+                  "member_id", "grid_label", "variable_id",
+                  "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]
+
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
+
+output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
+
+output_file_template: ['realm','time_range','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
+
+#######################################################
+
+input_path:  "/archive/Eric.Stofferahn/CMIP7/ESM4/DEV/ESM4.5v01_om5b04_piC/gfdl.ncrc5-intel23-prod-openmp/pp/" 
+output_path: "/home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)