807 ast data (#816)
* All information available in the asteroid table was added to the prediction pipeline. #807

* Minor fixes

* Closed #807 - Prediction pipeline now records all asteroid data for each prediction event.

* Updated documentation

* Added progress bar to the download of external inputs

* Added progress bar to update_asteroid_table command #807

* Created clean_duplicate_asteroids function to remove duplicate asteroid rows.

* Field descriptions and help text updated

* Field descriptions and help text updated

---------

Co-authored-by: rcboufleur <rcboufleur@gmail.com>
glaubervila and rcboufleur authored Feb 6, 2024
1 parent 44aac7d commit d28fe6c
Showing 13 changed files with 643 additions and 355 deletions.
86 changes: 75 additions & 11 deletions backend/tno/asteroid_table/asteroid_table_build.py
@@ -46,6 +46,9 @@
import traceback
import humanize

+import functools
+import tqdm
+import shutil
# Function to download the file if it doesn't exist or if it has changed
def download_file_if_not_exists_or_changed(url, directory_path, filename, log):
    """
@@ -141,11 +144,20 @@ def download_file(url, local_filename):
    - url (str): URL of the file to be downloaded.
    - local_filename (str): Path and name for the file to be saved locally.
    """
-    response = requests.get(url, stream=True)
-    with open(local_filename, 'wb') as file:
-        for chunk in response.iter_content(chunk_size=1024):
-            if chunk:
-                file.write(chunk)
+    r = requests.get(url, stream=True)
+
+    file_size = int(r.headers.get('Content-Length', 0))
+    desc = "(Unknown total file size)" if file_size == 0 else ""
+    r.raw.read = functools.partial(r.raw.read, decode_content=True)
+    with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
+        with open(local_filename, 'wb') as f:
+            shutil.copyfileobj(r_raw, f)
+
+    # r = requests.get(url, stream=True)
+    # with open(local_filename, 'wb') as file:
+    #     for chunk in r.iter_content(chunk_size=1024):
+    #         if chunk:
+    #             file.write(chunk)


# Function to get the remote file size
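The rewritten download_file above streams the response and wraps the raw byte stream so every read updates a progress bar. Below is a minimal, self-contained sketch of the same requests + tqdm idiom; the helper name and URL are illustrative only, not part of the repository.

import functools
import shutil

import requests
import tqdm

def download_with_progress(url, local_filename):
    # Stream the response so the whole body is never held in memory.
    r = requests.get(url, stream=True)

    # Content-Length may be absent; tqdm then shows an open-ended bar.
    file_size = int(r.headers.get('Content-Length', 0))

    # Have urllib3 decode gzip/deflate transparently while reading r.raw.
    r.raw.read = functools.partial(r.raw.read, decode_content=True)

    # wrapattr() intercepts read() calls and advances the bar by bytes read.
    with tqdm.tqdm.wrapattr(r.raw, "read", total=file_size) as stream:
        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(stream, f)

# Illustrative call (any large file behaves the same way):
# download_with_progress("https://example.org/large-file.bin", "large-file.bin")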
@@ -256,7 +268,7 @@ def load_mpcorb_extended(local_filename, log):
        # Log loading and processing time
        end_time = time.time()  # Record the end time
        loading_time = end_time - start_time
-        log.debug(f"{local_filename} took {loading_time:.2f} seconds to load and process")
+        log.debug(f"{local_filename} took {loading_time:.0f} seconds to load and process")
    except:
        raise Exception(f"An error occurred while {error_message} in DataFrame read from: {local_filename}")

@@ -309,7 +321,7 @@ def load_ssoBFT(local_filename, log):
        # Log loading and processing time
        end_time = time.time()  # Record the end time
        loading_time = end_time - start_time
-        log.debug(f"{local_filename} took {loading_time:.2f} seconds to load and process")
+        log.debug(f"{local_filename} took {loading_time:.0f} seconds to load and process")
    except:
        raise Exception(f"An error occurred while {error_message} in DataFrame read from: {local_filename}")

@@ -620,14 +632,14 @@ def conform_astorb_lowell_obs_dynclass(dataframe, log):
    log.info("Computing and parsing dynamical classes according to ASTORB database...")
    log.debug("Estimating remaining time...")
    total_rows = len(dataframe)
-    modref = 100 if (100 / total_rows) < 0.1 else np.ceil(total_rows*0.1)
+    modref = 1000 if (1000 / total_rows) < 0.1 else np.ceil(total_rows*0.1)
    start_time = time.time()  # Record the start time


    for index in range(total_rows):
        if index == modref:
            remaining_time = (total_rows - modref)*(time.time() - start_time)/modref
-            log.debug(f'Remaining Time: {remaining_time:.2f} seconds...' if remaining_time <= 60 else f'Remaining Time: {remaining_time/60:.2f} minutes...' if remaining_time <= 3600 else f'Remaining Time: {remaining_time/3600:.2f} hours...')
+            log.debug(f'Remaining Time: {remaining_time:.0f} seconds...' if remaining_time <= 60 else f'Remaining Time: {remaining_time/60:.0f} minutes...' if remaining_time <= 3600 else f'Remaining Time: {remaining_time/3600:.0f} hours...')

        try:
            a = dataframe.iloc[index]['semimajor_axis']
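The remaining-time estimate in the loop above extrapolates from the first modref rows: once modref iterations have completed, the average per-row time so far is assumed to hold for the rest (raising modref from 100 to 1000 samples more rows before extrapolating, which steadies the estimate). A hypothetical refactoring of that logic, not part of the commit:

import time

def log_eta(total_rows, modref, start_time, log):
    # Average cost of the first `modref` rows, scaled to the rows still to go.
    elapsed = time.time() - start_time
    remaining = (total_rows - modref) * elapsed / modref
    if remaining <= 60:
        log.debug(f"Remaining Time: {remaining:.0f} seconds...")
    elif remaining <= 3600:
        log.debug(f"Remaining Time: {remaining/60:.0f} minutes...")
    else:
        log.debug(f"Remaining Time: {remaining/3600:.0f} hours...")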
@@ -708,6 +720,55 @@ def validate_last_obs_included(date_string):
    except:
        return '1800-01-01'

+def clean_duplicate_asteroids(dataframe, category='name'):
+    """
+    Clean a DataFrame by removing duplicate rows based on a specified category column.
+
+    Parameters:
+    -----------
+    dataframe : pandas DataFrame
+        The input DataFrame containing asteroid data.
+    category : str, optional
+        The name of the column used for grouping and identifying duplicates.
+        Default is 'name'.
+
+    Returns:
+    --------
+    pandas DataFrame
+        A cleaned DataFrame with duplicate rows removed.
+
+    Example:
+    --------
+    # Import pandas and create a sample DataFrame
+    import pandas as pd
+    data = {'name': ['Asteroid1', 'Asteroid2', 'Asteroid1', 'Asteroid3'],
+            'diameter_km': [5, 8, 5, 10]}
+    df = pd.DataFrame(data)
+
+    # Clean the DataFrame by removing duplicate asteroids based on 'name'
+    cleaned_df = clean_duplicate_asteroids(df, category='name')
+
+    # Display the cleaned DataFrame
+    print(cleaned_df)
+
+    Output:
+            name  diameter_km
+    0  Asteroid1            5
+    1  Asteroid2            8
+    3  Asteroid3           10
+    """
+    data_groups = dataframe.groupby(category)
+
+    for name, group in data_groups:
+        if len(group) > 1:
+            # Keep the row with the most non-null columns; mark the rest for removal
+            index, idxmax = group.index.tolist(), group.count(axis=1).idxmax()
+            index.remove(idxmax)
+
+            # Drop duplicate rows from the original DataFrame
+            for i in index:
+                dataframe = dataframe.drop(i)
+
+    return dataframe
+
+
def asteroid_table_build(table_path, log):
    """
@@ -734,6 +795,8 @@ def asteroid_table_build(table_path, log):

        # Development Only
        # data_mpc = data_mpc.iloc[0:5, ]
+        # index = [1,2,3,4,5,131240,644901,201725,656765,470433,710161]
+        # data_mpc = data_mpc.iloc[index]

        # Define the URL of the file
        ssoBFT_url = "https://ssp.imcce.fr/data/ssoBFT-latest.ecsv.bz2"
@@ -768,6 +831,9 @@

        # Fix duplicated aliases
        asteroid_table['alias'] = fix_duplicated_alias(asteroid_table['alias'].values)
+
+        # Fix duplicated rows
+        asteroid_table = clean_duplicate_asteroids(asteroid_table, category='name')

        asteroid_table = asteroid_table.rename(
            columns={
@@ -789,8 +855,6 @@
            ]
        )

-
-        # import_asteroid_table(output_table)
        return asteroid_table
    except Exception as e:
        raise Exception(e)
3 changes: 3 additions & 0 deletions backend/tno/asteroid_table/asteroid_table_manager.py
@@ -59,6 +59,9 @@ def run_update_asteroid_table(self):
        # Create dataframe with asteroids
        df = asteroid_table_build(str(self.path), self.log)

+        # Write the dataframe to a csv file for debugging.
+        df.to_csv(self.path.joinpath("asteroid_table_debug.csv"), index=False)
+
        # Import dataframe to database
        self.asteroids_after = import_asteroid_table(df, self.log)
Binary file removed backend/tno/fixtures/asteroid.json.gz
