Commit

skip unneeded variables
As agreed with UNIVIE. Also use longitudes and latitudes from header_table instead of from station_configuration.
garciampred committed Aug 14, 2024
1 parent 60afd48 commit 99c68c8
Showing 3 changed files with 21 additions and 17 deletions.
@@ -199,5 +199,5 @@ sources:
     - desroziers_30_uncertainy
   group_name: uncertainty
   space_columns:
-    y: latitude|station_configuration
-    x: longitude|station_configuration
+    y: latitude|header_table
+    x: longitude|header_table
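
For context, the y and x entries use the column|table convention that qualifies a column name with its source table. A minimal sketch of how such a block could be parsed into the space_columns object that the rewritten sort() consumes below; the SpaceColumns dataclass here is illustrative, the actual model in cdsobs may differ:

```python
from dataclasses import dataclass

import yaml


@dataclass
class SpaceColumns:
    """Latitude (y) and longitude (x) column names, optionally
    qualified with their source table as "name|table"."""

    y: str
    x: str


config = yaml.safe_load(
    """
space_columns:
  y: latitude|header_table
  x: longitude|header_table
"""
)
space_columns = SpaceColumns(**config["space_columns"])
assert space_columns.y == "latitude|header_table"
```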
16 changes: 4 additions & 12 deletions cdsobs/ingestion/api.py
@@ -1,6 +1,6 @@
 from importlib import import_module
 from pathlib import Path
-from typing import List, Tuple
+from typing import List

 import numpy
 import pandas
@@ -151,20 +151,12 @@ def cast_to_descriptions(
     return data_renamed


-def _get_latlon_names(data: pandas.DataFrame) -> Tuple[str, str]:
-    if "longitude" in data:
-        latname = "latitude"
-        lonname = "longitude"
-    else:
-        latname = "latitude|station_configuration"
-        lonname = "longitude|station_configuration"
-    return latname, lonname
-
-
 def sort(partition: DatasetPartition) -> DatasetPartition:
     """Sort data of a partition."""
     logger.info("Sorting partition data")
-    latname, lonname = _get_latlon_names(partition.data)
+    space_columns = partition.dataset_metadata.space_columns
+    latname = space_columns.y
+    lonname = space_columns.x
     partition.data.sort_values(
         by=["report_timestamp", latname, lonname], kind="mergesort", inplace=True
     )
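
The rewritten sort no longer guesses the coordinate names from the data; it takes them from the dataset metadata. A small self-contained sketch of the resulting sort call, using hypothetical data with header_table-qualified coordinates (the stable mergesort keeps the relative order of rows that tie on all keys):

```python
import pandas

# Hypothetical partition data; column names follow the "name|table" convention.
data = pandas.DataFrame(
    {
        "report_timestamp": ["2000-01-02", "2000-01-01", "2000-01-01"],
        "latitude|header_table": [10.0, 20.0, 10.0],
        "longitude|header_table": [5.0, 5.0, 5.0],
    }
)
# space_columns.y / space_columns.x as read from the config above.
latname, lonname = "latitude|header_table", "longitude|header_table"
data.sort_values(
    by=["report_timestamp", latname, lonname], kind="mergesort", inplace=True
)
print(data)
```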
18 changes: 15 additions & 3 deletions cdsobs/ingestion/readers/cuon.py
@@ -131,6 +131,8 @@ def _process_table(
         "eda_spread@errstat",
         "processing_level",
         "location_method",
+        "source_id",
+        "crs",
     ]
     file_vars = [
         fv
@@ -141,7 +143,11 @@
         logger.debug(f"Reading variable {variable}")
         selector = slices[variable]
         # dropping string dims - not necessary for dataframes
-        fields = [f for f in hfile[table_name] if "string" not in f]
+        fields = [
+            f
+            for f in hfile[table_name]
+            if "string" not in f and f not in vals_to_exclude
+        ]
         data: dict[str, numpy.ndarray] = {
             field: _get_field_data(field, hfile, selector, table_name)
             for field in fields
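
The point of filtering the field list before reading is that excluded variables are never pulled from disk at all. A minimal sketch of the same pattern with plain h5py; the file name, table name, selector, and the exclusion subset shown are hypothetical stand-ins for what _process_table uses:

```python
import h5py
import numpy

# Subset of the variables agreed with UNIVIE to be skipped at read time.
vals_to_exclude = {"source_id", "crs", "eda_spread@errstat"}

with h5py.File("station.nc", "r") as hfile:  # hypothetical file
    table_name = "observations_table"
    selector = slice(0, 1000)
    fields = [
        f
        for f in hfile[table_name]
        if "string" not in f and f not in vals_to_exclude
    ]
    # Only the retained fields are ever read into memory.
    data: dict[str, numpy.ndarray] = {
        field: hfile[table_name][field][selector] for field in fields
    }
```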
@@ -409,7 +415,13 @@ def _fix_table_data(
 ):
     # the name in station_configuration
     if table_name == "header_table":
-        vars_to_drop = ["station_name", "platform_sub_type", "platform_type"]
+        vars_to_drop = [
+            "station_name",
+            "platform_sub_type",
+            "platform_type",
+            "station_type",
+            "crs",
+        ]
         table_data = table_data.drop(vars_to_drop, axis=1, errors="ignore")
     # Check that observation id is unique and fix if not
     if table_name == "observations_table":
@@ -432,7 +444,7 @@
         table_data = table_data.drop_duplicates(
             subset=["primary_id", "record_number"], ignore_index=True
         )
-        table_data = table_data.drop(["latitude", "longitude"])
+        table_data = table_data.drop(["latitude", "longitude"], axis=1)
     # Check primary keys can be used to build a unique index
     primary_keys = table_definition.primary_keys
     if table_name in [
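
The axis=1 addition here fixes a real bug: pandas DataFrame.drop defaults to axis=0, so the old call looked for index labels named "latitude" and "longitude" instead of columns and raised a KeyError. A short demonstration with hypothetical data:

```python
import pandas

table_data = pandas.DataFrame(
    {"latitude": [10.0], "longitude": [5.0], "observed_variable": ["ta"]}
)

# Before the fix: axis defaults to 0 (rows), so pandas searches the
# *index* for these labels and raises a KeyError.
try:
    table_data.drop(["latitude", "longitude"])
except KeyError as err:
    print(err)

# After the fix: axis=1 drops the columns as intended.
table_data = table_data.drop(["latitude", "longitude"], axis=1)
print(table_data.columns.tolist())  # ['observed_variable']
```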
