807 ast data (#816)
* All information available in the asteroid table was added to the prediction pipeline. #807

* Minor fixes

* Closed #807 - Prediction pipeline now records all asteroid data for each prediction event.

* Updated documentation

* Added progress bar to the download of external inputs

* Added progress bar to update_asteroid_table command #807

* Created clean_duplicate_asteroids function to remove duplicate asteroid rows.

* Field descriptions and help text updated

* Field descriptions and help text updated

---------

Co-authored-by: rcboufleur <rcboufleur@gmail.com>
glaubervila and rcboufleur authored Feb 6, 2024
1 parent 44aac7d commit d28fe6c
Showing 13 changed files with 643 additions and 355 deletions.
86 changes: 75 additions & 11 deletions backend/tno/asteroid_table/asteroid_table_build.py
@@ -46,6 +46,9 @@
import traceback
import humanize

+import functools
+import tqdm
+import shutil
# Function to download the file if it doesn't exist or if it has changed
def download_file_if_not_exists_or_changed(url, directory_path, filename, log):
    """
@@ -141,11 +144,20 @@ def download_file(url, local_filename):
    - url (str): URL of the file to be downloaded.
    - local_filename (str): Path and name for the file to be saved locally.
    """
-    response = requests.get(url, stream=True)
-    with open(local_filename, 'wb') as file:
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                file.write(chunk)
+    r = requests.get(url, stream=True)
+
+    file_size = int(r.headers.get('Content-Length', 0))
+    desc = "(Unknown total file size)" if file_size == 0 else ""
+    r.raw.read = functools.partial(r.raw.read, decode_content=True)
+    with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
+        with open(local_filename, 'wb') as f:
+            shutil.copyfileobj(r_raw, f)
+
+    # r = requests.get(url, stream=True)
+    # with open(local_filename, 'wb') as file:
+    #     for chunk in r.iter_content(chunk_size=1024):
+    #         if chunk:
+    #             file.write(chunk)


# Function to get the remote file size
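The rewritten download_file above streams the response and wraps the raw byte stream so every read updates a progress bar. Below is a minimal, self-contained sketch of the same requests + tqdm idiom; the helper name and URL are illustrative only, not part of the repository.

import functools
import shutil

import requests
import tqdm

def download_with_progress(url, local_filename):
    # Stream the response so the whole body is never held in memory.
    r = requests.get(url, stream=True)

    # Content-Length may be absent; tqdm then shows an open-ended bar.
    file_size = int(r.headers.get('Content-Length', 0))

    # Have urllib3 decode gzip/deflate transparently while reading r.raw.
    r.raw.read = functools.partial(r.raw.read, decode_content=True)

    # wrapattr() intercepts read() calls and advances the bar by bytes read.
    with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size) as stream:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(stream, f)

# Illustrative call (any large file behaves the same way):
# download_with_progress("https://example.org/large-file.bin", "large-file.bin")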
@@ -256,7 +268,7 @@ def load_mpcorb_extended(local_filename, log):
        # Log loading and processing time
        end_time = time.time()  # Record the end time
        loading_time = end_time - start_time
-        log.debug(f"{local_filename} took {loading_time:.2f} seconds to load and process")
+        log.debug(f"{local_filename} took {loading_time:.0f} seconds to load and process")
    except:
        raise Exception(f"An error occurred while {error_message} in DataFrame read from: {local_filename}")

@@ -309,7 +321,7 @@ def load_ssoBFT(local_filename, log):
        # Log loading and processing time
        end_time = time.time()  # Record the end time
        loading_time = end_time - start_time
-        log.debug(f"{local_filename} took {loading_time:.2f} seconds to load and process")
+        log.debug(f"{local_filename} took {loading_time:.0f} seconds to load and process")
    except:
        raise Exception(f"An error occurred while {error_message} in DataFrame read from: {local_filename}")

@@ -620,14 +632,14 @@ def conform_astorb_lowell_obs_dynclass(dataframe, log):
    log.info("Computing and parsing dynamical classes according to ASTORB database...")
    log.debug("Estimating remaining time...")
    total_rows = len(dataframe)
-    modref = 100 if (100 / total_rows) < 0.1 else np.ceil(total_rows*0.1)
+    modref = 1000 if (1000 / total_rows) < 0.1 else np.ceil(total_rows*0.1)
    start_time = time.time()  # Record the start time


    for index in range(total_rows):
        if index == modref:
            remaining_time = (total_rows - modref)*(time.time() - start_time)/modref
-            log.debug(f'Remaining Time: {remaining_time:.2f} seconds...' if remaining_time <= 60 else f'Remaining Time: {remaining_time/60:.2f} minutes...' if remaining_time <= 3600 else f'Remaining Time: {remaining_time/3600:.2f} hours...')
+            log.debug(f'Remaining Time: {remaining_time:.0f} seconds...' if remaining_time <= 60 else f'Remaining Time: {remaining_time/60:.0f} minutes...' if remaining_time <= 3600 else f'Remaining Time: {remaining_time/3600:.0f} hours...')

        try:
            a = dataframe.iloc[index]['semimajor_axis']
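The remaining-time estimate in the loop above extrapolates from the first modref rows: once modref iterations have completed, the average per-row time so far is assumed to hold for the rest (raising modref from 100 to 1000 samples more rows before extrapolating, which steadies the estimate). A hypothetical refactoring of that logic, not part of the commit:

import time

def log_eta(total_rows, modref, start_time, log):
    # Average cost of the first `modref` rows, scaled to the rows still to go.
    elapsed = time.time() - start_time
    remaining = (total_rows - modref) * elapsed / modref
    if remaining <= 60:
        log.debug(f"Remaining Time: {remaining:.0f} seconds...")
    elif remaining <= 3600:
        log.debug(f"Remaining Time: {remaining/60:.0f} minutes...")
    else:
        log.debug(f"Remaining Time: {remaining/3600:.0f} hours...")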
@@ -708,6 +720,55 @@ def validate_last_obs_included(date_string):
    except:
        return '1800-01-01'

+def clean_duplicate_asteroids(dataframe, category='name'):
+    """
+    Clean a DataFrame by removing duplicate rows based on a specified category column.
+
+    Parameters:
+    -----------
+    dataframe : pandas DataFrame
+        The input DataFrame containing asteroid data.
+    category : str, optional
+        The name of the column used for grouping and identifying duplicates.
+        Default is 'name'.
+
+    Returns:
+    --------
+    pandas DataFrame
+        A cleaned DataFrame with duplicate rows removed.
+
+    Example:
+    --------
+    # Import pandas and create a sample DataFrame
+    import pandas as pd
+    data = {'name': ['Asteroid1', 'Asteroid2', 'Asteroid1', 'Asteroid3'],
+            'diameter_km': [5, 8, 5, 10]}
+    df = pd.DataFrame(data)
+
+    # Clean the DataFrame by removing duplicate asteroids based on 'name'
+    cleaned_df = clean_duplicate_asteroids(df, category='name')
+
+    # Display the cleaned DataFrame
+    print(cleaned_df)
+
+    Output:
+            name  diameter_km
+    0  Asteroid1            5
+    1  Asteroid2            8
+    3  Asteroid3           10
+    """
+    data_groups = dataframe.groupby(category)
+
+    for name, group in data_groups:
+        if len(group) > 1:
+            # Keep the row with the most non-null columns; mark the rest for removal
+            index, idxmax = group.index.tolist(), group.count(axis=1).idxmax()
+            index.remove(idxmax)
+
+            # Drop duplicate rows from the original DataFrame
+            for i in index:
+                dataframe = dataframe.drop(i)
+
+    return dataframe
+
+
def asteroid_table_build(table_path, log):
    """
@@ -734,6 +795,8 @@ def asteroid_table_build(table_path, log):

        # Development Only
        # data_mpc = data_mpc.iloc[0:5, ]
+        # index = [1,2,3,4,5,131240,644901,201725,656765,470433,710161]
+        # data_mpc = data_mpc.iloc[index]

        # Define the URL of the file
        ssoBFT_url = "https://ssp.imcce.fr/data/ssoBFT-latest.ecsv.bz2"
@@ -768,6 +831,9 @@

        # Fix duplicated aliases
        asteroid_table['alias'] = fix_duplicated_alias(asteroid_table['alias'].values)
+
+        # Fix duplicated rows
+        asteroid_table = clean_duplicate_asteroids(asteroid_table, category='name')

        asteroid_table = asteroid_table.rename(
            columns={
@@ -789,8 +855,6 @@
            ]
        )

-
-        # import_asteroid_table(output_table)
        return asteroid_table
    except Exception as e:
        raise Exception(e)
3 changes: 3 additions & 0 deletions backend/tno/asteroid_table/asteroid_table_manager.py
@@ -59,6 +59,9 @@ def run_update_asteroid_table(self):
        # Create dataframe with asteroids
        df = asteroid_table_build(str(self.path), self.log)

+        # Write the dataframe to a csv file for debugging.
+        df.to_csv(self.path.joinpath("asteroid_table_debug.csv"), index=False)
+
        # Import dataframe to database
        self.asteroids_after = import_asteroid_table(df, self.log)
Binary file removed backend/tno/fixtures/asteroid.json.gz
