-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Metric importer refactor #1035
Metric importer refactor #1035
Changes from 15 commits
09c4375
8ca1e73
38352b2
c048ba1
1f9ba39
e502173
27ca35b
d045544
c83f2a3
93dd25a
3f4bfc4
43c7551
4a4b09f
db14522
8651deb
e92dfce
91e7310
01d8835
37780b0
3f0a51d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from io import StringIO | ||
|
||
from lib.db import DB | ||
from metric_providers.network.connections.tcpdump.system.provider import generate_stats_string | ||
|
||
def import_measurements_new(df, metric_name, run_id):
    """Store the measurements in *df* for one metric of one run.

    For every distinct ``(detail_name, unit)`` pair in *df* a row is inserted
    into ``measurement_metrics``; the id returned by the database is written
    to the ``measurement_metric_id`` column of the matching rows. The
    ``measurement_metric_id``, ``value`` and ``time`` columns are then
    bulk-copied as CSV into ``measurement_values``.

    *df* is modified in place: its ``measurement_metric_id`` column is
    created (or overwritten).

    Args:
        df: DataFrame providing the columns ``detail_name``, ``unit``,
            ``value`` and ``time``.
        metric_name: Value stored in ``measurement_metrics.metric``.
        run_id: Value stored in ``measurement_metrics.run_id``.
    """
    df['measurement_metric_id'] = None  # prepare
    detail_names = df[['detail_name', 'unit']].drop_duplicates()

    # One INSERT per distinct pair, because the generated id of each pair is
    # needed to tag the matching measurement rows.
    # NOTE(review): with many pairs this could presumably be batched into a
    # single multi-row INSERT ... RETURNING id, detail_name, unit -- confirm
    # that the DB helper offers a way to fetch all returned rows first.
    for _, row in detail_names.iterrows():
        measurement_metric_id = DB().fetch_one('''
            INSERT INTO measurement_metrics (run_id, metric, detail_name, unit)
            VALUES (%s, %s, %s, %s)
            RETURNING id
        ''', params=(run_id, metric_name, row['detail_name'], row['unit']))[0]
        df.loc[(df['detail_name'] == row['detail_name']) & (df['unit'] == row['unit']), 'measurement_metric_id'] = measurement_metric_id

    # The context manager closes the buffer even when copy_from raises.
    with StringIO(df[['measurement_metric_id', 'value', 'time']]
                  .to_csv(index=False, header=False)) as f:
        DB().copy_from(file=f, table='measurement_values', columns=['measurement_metric_id', 'value', 'time'], sep=',')
|
||
def import_measurements(df, metric_name, run_id, containers=None):
    """Persist the data in *df* for *metric_name*, dispatching on the metric.

    * ``network_connections_proxy_container_dockerproxy``: a ``run_id``
      column is set on *df* and all columns are bulk-copied as CSV into
      ``network_intercepts``.
    * ``network_connections_tcpdump_system``: the string produced by
      ``generate_stats_string(df)`` is appended to ``runs.logs`` of the run.
    * any other metric: if *df* has a ``container_id`` column it is first
      converted to ``detail_name`` via ``map_container_id_to_detail_name``;
      then a ``run_id`` column is set and all columns are bulk-copied as CSV
      into ``measurements``.

    The ``run_id`` column is assigned directly on the frame being imported,
    so the caller's *df* may be modified in place.

    Args:
        df: DataFrame whose columns match the target table (apart from
            ``run_id`` and the ``container_id`` handling described above).
        metric_name: Name of the metric; selects the branch taken.
        run_id: Id of the run the data belongs to.
        containers: Container lookup passed on unchanged to
            ``map_container_id_to_detail_name``; only used when *df* has a
            ``container_id`` column.
    """
    if metric_name == 'network_connections_proxy_container_dockerproxy':

        df['run_id'] = run_id
        # The context manager closes the buffer even when copy_from raises.
        with StringIO(df.to_csv(index=False, header=False)) as f:
            DB().copy_from(file=f, table='network_intercepts', columns=df.columns, sep=',')

    elif metric_name == 'network_connections_tcpdump_system':
        DB().query("""
            UPDATE runs
            SET logs= COALESCE(logs, '') || %s -- append
            WHERE id = %s
        """, params=(generate_stats_string(df), run_id))

    else:

        if 'container_id' in df.columns:
            df = map_container_id_to_detail_name(df, containers)

        df['run_id'] = run_id

        # The context manager closes the buffer even when copy_from raises.
        with StringIO(df.to_csv(index=False, header=False)) as f:
            DB().copy_from(file=f, table='measurements', columns=df.columns, sep=',')
|
||
def map_container_id_to_detail_name(df, containers):
    """Return a copy of *df* with ``container_id`` replaced by ``detail_name``.

    Each container id that is a key of *containers* is translated to
    ``containers[container_id]['name']``; ids without an entry are carried
    over unchanged. The ``container_id`` column is removed from the result
    and ``detail_name`` is placed where a plain column assignment would put
    it (appended last, or in place if the column already exists).

    The input frame is left untouched. Every id is translated exactly once,
    so a container whose name happens to equal another container's id is not
    translated a second time.

    Args:
        df: DataFrame with a ``container_id`` column.
        containers: Mapping of container id to a mapping holding a ``name``
            entry. ``None`` is treated as an empty mapping.

    Returns:
        A new DataFrame without ``container_id`` and with ``detail_name``.
    """
    id_to_name = {
        container_id: info['name']
        for container_id, info in (containers or {}).items()
    }
    # Unknown ids fall back to the id itself.
    detail_names = df['container_id'].map(lambda cid: id_to_name.get(cid, cid))

    return df.drop('container_id', axis=1).assign(detail_name=detail_names)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: Consider adding indexes on resolution columns if they will be frequently queried for analysis or filtering