Skip to content

Commit b6507da

Browse files
authored
Cleanup snapshot in test_evaluate_async_logfire (#2538)
1 parent 53c23fe commit b6507da

File tree

1 file changed

+219
-96
lines changed

1 file changed

+219
-96
lines changed

tests/evals/test_dataset.py

Lines changed: 219 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing import Any
88

99
import pytest
10-
from dirty_equals import HasRepr, IsNumber, IsPartialDict
10+
from dirty_equals import HasRepr, IsNumber
1111
from inline_snapshot import snapshot
1212
from pydantic import BaseModel, TypeAdapter
1313

@@ -1172,115 +1172,238 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput:
11721172

11731173
await example_dataset.evaluate(mock_async_task)
11741174

1175-
spans = capfire.exporter.exported_spans_as_dict()
1175+
spans = capfire.exporter.exported_spans_as_dict(parse_json_attributes=True)
11761176
spans.sort(key=lambda s: s['start_time'])
1177-
assert spans == [
1178-
{
1179-
'attributes': {
1180-
'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":5.0}',
1181-
'cases': '[{"name":"case1","inputs":{"query":"What is '
1182-
'2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":6.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003"},{"name":"case2","inputs":{"query":"What '
1183-
'is the capital of '
1184-
'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":4.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007"}]',
1185-
'code.filepath': 'test_dataset.py',
1186-
'code.function': 'test_evaluate_async_logfire',
1187-
'code.lineno': 123,
1188-
'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"averages":{"type":"object"}}}',
1189-
'logfire.msg': 'evaluate mock_async_task',
1190-
'logfire.msg_template': 'evaluate {name}',
1191-
'logfire.span_type': 'span',
1192-
'name': 'mock_async_task',
1193-
},
1194-
'context': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
1195-
'end_time': 10000000000,
1196-
'name': 'evaluate {name}',
1197-
'parent': None,
1198-
'start_time': 1000000000,
1199-
},
1200-
IsPartialDict(
1201-
{
1202-
'attributes': {
1203-
'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
1204-
'attributes': '{}',
1177+
1178+
for span in spans:
1179+
# These attributes may be absent, or may hold unstable values, because the code runs asynchronously; remove them before comparing
1180+
span['attributes'].pop('code.filepath', None)
1181+
span['attributes'].pop('code.function', None)
1182+
span['attributes'].pop('code.lineno', None)
1183+
1184+
assert [(span['name'], span['attributes']) for span in spans] == snapshot(
1185+
[
1186+
(
1187+
'evaluate {name}',
1188+
{
1189+
'name': 'mock_async_task',
1190+
'logfire.msg_template': 'evaluate {name}',
1191+
'logfire.msg': 'evaluate mock_async_task',
1192+
'logfire.span_type': 'span',
1193+
'cases': [
1194+
{
1195+
'name': 'case1',
1196+
'inputs': {'query': 'What is 2+2?'},
1197+
'metadata': {'difficulty': 'easy', 'category': 'general'},
1198+
'expected_output': {'answer': '4', 'confidence': 1.0},
1199+
'output': {'answer': '4', 'confidence': 1.0},
1200+
'metrics': {},
1201+
'attributes': {},
1202+
'scores': {
1203+
'confidence': {
1204+
'name': 'confidence',
1205+
'value': 1.0,
1206+
'reason': None,
1207+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1208+
}
1209+
},
1210+
'labels': {},
1211+
'assertions': {
1212+
'correct': {
1213+
'name': 'correct',
1214+
'value': True,
1215+
'reason': None,
1216+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1217+
}
1218+
},
1219+
'task_duration': 1.0,
1220+
'total_duration': 6.0,
1221+
'trace_id': '00000000000000000000000000000001',
1222+
'span_id': '0000000000000003',
1223+
},
1224+
{
1225+
'name': 'case2',
1226+
'inputs': {'query': 'What is the capital of France?'},
1227+
'metadata': {'difficulty': 'medium', 'category': 'geography'},
1228+
'expected_output': {'answer': 'Paris', 'confidence': 1.0},
1229+
'output': {'answer': 'Paris', 'confidence': 1.0},
1230+
'metrics': {},
1231+
'attributes': {},
1232+
'scores': {
1233+
'confidence': {
1234+
'name': 'confidence',
1235+
'value': 1.0,
1236+
'reason': None,
1237+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1238+
}
1239+
},
1240+
'labels': {},
1241+
'assertions': {
1242+
'correct': {
1243+
'name': 'correct',
1244+
'value': True,
1245+
'reason': None,
1246+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1247+
}
1248+
},
1249+
'task_duration': 1.0,
1250+
'total_duration': 4.0,
1251+
'trace_id': '00000000000000000000000000000001',
1252+
'span_id': '0000000000000007',
1253+
},
1254+
],
1255+
'averages': {
1256+
'name': 'Averages',
1257+
'scores': {'confidence': 1.0},
1258+
'labels': {},
1259+
'metrics': {},
1260+
'assertions': 1.0,
1261+
'task_duration': 1.0,
1262+
'total_duration': 5.0,
1263+
},
1264+
'logfire.json_schema': {
1265+
'type': 'object',
1266+
'properties': {'name': {}, 'cases': {'type': 'array'}, 'averages': {'type': 'object'}},
1267+
},
1268+
},
1269+
),
1270+
(
1271+
'case: {case_name}',
1272+
{
1273+
'task_name': 'mock_async_task',
12051274
'case_name': 'case1',
1206-
'expected_output': '{"answer":"4","confidence":1.0}',
1207-
'inputs': '{"query":"What is 2+2?"}',
1208-
'labels': '{}',
1209-
'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}',
1210-
'logfire.msg': 'case: case1',
1275+
'inputs': {'query': 'What is 2+2?'},
1276+
'metadata': {'difficulty': 'easy', 'category': 'general'},
1277+
'expected_output': {'answer': '4', 'confidence': 1.0},
12111278
'logfire.msg_template': 'case: {case_name}',
1279+
'logfire.msg': 'case: case1',
12121280
'logfire.span_type': 'span',
1213-
'metadata': '{"difficulty":"easy","category":"general"}',
1214-
'metrics': '{}',
1215-
'output': '{"answer":"4","confidence":1.0}',
1216-
'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
1281+
'output': {'answer': '4', 'confidence': 1.0},
12171282
'task_duration': 1.0,
1218-
'task_name': 'mock_async_task',
1283+
'metrics': {},
1284+
'attributes': {},
1285+
'assertions': {
1286+
'correct': {
1287+
'name': 'correct',
1288+
'value': True,
1289+
'reason': None,
1290+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1291+
}
1292+
},
1293+
'scores': {
1294+
'confidence': {
1295+
'name': 'confidence',
1296+
'value': 1.0,
1297+
'reason': None,
1298+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1299+
}
1300+
},
1301+
'labels': {},
1302+
'logfire.json_schema': {
1303+
'type': 'object',
1304+
'properties': {
1305+
'task_name': {},
1306+
'case_name': {},
1307+
'inputs': {'type': 'object', 'title': 'TaskInput', 'x-python-datatype': 'PydanticModel'},
1308+
'metadata': {
1309+
'type': 'object',
1310+
'title': 'TaskMetadata',
1311+
'x-python-datatype': 'PydanticModel',
1312+
},
1313+
'expected_output': {
1314+
'type': 'object',
1315+
'title': 'TaskOutput',
1316+
'x-python-datatype': 'PydanticModel',
1317+
},
1318+
'output': {'type': 'object', 'title': 'TaskOutput', 'x-python-datatype': 'PydanticModel'},
1319+
'task_duration': {},
1320+
'metrics': {'type': 'object'},
1321+
'attributes': {'type': 'object'},
1322+
'assertions': {'type': 'object'},
1323+
'scores': {'type': 'object'},
1324+
'labels': {'type': 'object'},
1325+
},
1326+
},
12191327
},
1220-
'context': {'is_remote': False, 'span_id': 3, 'trace_id': 1},
1221-
'end_time': 8000000000,
1222-
'name': 'case: {case_name}',
1223-
'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
1224-
'start_time': 2000000000,
1225-
}
1226-
),
1227-
IsPartialDict(
1228-
{
1229-
'attributes': {
1230-
'logfire.json_schema': '{"type":"object","properties":{"task":{}}}',
1231-
'logfire.msg': 'execute mock_async_task',
1328+
),
1329+
(
1330+
'execute {task}',
1331+
{
1332+
'task': 'mock_async_task',
12321333
'logfire.msg_template': 'execute {task}',
1334+
'logfire.msg': 'execute mock_async_task',
1335+
'logfire.json_schema': {'type': 'object', 'properties': {'task': {}}},
12331336
'logfire.span_type': 'span',
1234-
'task': 'mock_async_task',
12351337
},
1236-
'context': {'is_remote': False, 'span_id': 5, 'trace_id': 1},
1237-
'end_time': 4000000000,
1238-
'name': 'execute {task}',
1239-
'parent': {'is_remote': False, 'span_id': 3, 'trace_id': 1},
1240-
'start_time': 3000000000,
1241-
}
1242-
),
1243-
IsPartialDict(
1244-
{
1245-
'attributes': {
1246-
'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
1247-
'attributes': '{}',
1338+
),
1339+
(
1340+
'case: {case_name}',
1341+
{
1342+
'task_name': 'mock_async_task',
12481343
'case_name': 'case2',
1249-
'expected_output': '{"answer":"Paris","confidence":1.0}',
1250-
'inputs': '{"query":"What is the capital of France?"}',
1251-
'labels': '{}',
1252-
'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}',
1253-
'logfire.msg': 'case: case2',
1344+
'inputs': {'query': 'What is the capital of France?'},
1345+
'metadata': {'difficulty': 'medium', 'category': 'geography'},
1346+
'expected_output': {'answer': 'Paris', 'confidence': 1.0},
12541347
'logfire.msg_template': 'case: {case_name}',
1348+
'logfire.msg': 'case: case2',
12551349
'logfire.span_type': 'span',
1256-
'metadata': '{"difficulty":"medium","category":"geography"}',
1257-
'metrics': '{}',
1258-
'output': '{"answer":"Paris","confidence":1.0}',
1259-
'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}',
1350+
'output': {'answer': 'Paris', 'confidence': 1.0},
12601351
'task_duration': 1.0,
1261-
'task_name': 'mock_async_task',
1352+
'metrics': {},
1353+
'attributes': {},
1354+
'assertions': {
1355+
'correct': {
1356+
'name': 'correct',
1357+
'value': True,
1358+
'reason': None,
1359+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1360+
}
1361+
},
1362+
'scores': {
1363+
'confidence': {
1364+
'name': 'confidence',
1365+
'value': 1.0,
1366+
'reason': None,
1367+
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1368+
}
1369+
},
1370+
'labels': {},
1371+
'logfire.json_schema': {
1372+
'type': 'object',
1373+
'properties': {
1374+
'task_name': {},
1375+
'case_name': {},
1376+
'inputs': {'type': 'object', 'title': 'TaskInput', 'x-python-datatype': 'PydanticModel'},
1377+
'metadata': {
1378+
'type': 'object',
1379+
'title': 'TaskMetadata',
1380+
'x-python-datatype': 'PydanticModel',
1381+
},
1382+
'expected_output': {
1383+
'type': 'object',
1384+
'title': 'TaskOutput',
1385+
'x-python-datatype': 'PydanticModel',
1386+
},
1387+
'output': {'type': 'object', 'title': 'TaskOutput', 'x-python-datatype': 'PydanticModel'},
1388+
'task_duration': {},
1389+
'metrics': {'type': 'object'},
1390+
'attributes': {'type': 'object'},
1391+
'assertions': {'type': 'object'},
1392+
'scores': {'type': 'object'},
1393+
'labels': {'type': 'object'},
1394+
},
1395+
},
12621396
},
1263-
'context': {'is_remote': False, 'span_id': 7, 'trace_id': 1},
1264-
'end_time': 9000000000,
1265-
'name': 'case: {case_name}',
1266-
'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1},
1267-
'start_time': 5000000000,
1268-
}
1269-
),
1270-
IsPartialDict(
1271-
{
1272-
'attributes': {
1273-
'logfire.json_schema': '{"type":"object","properties":{"task":{}}}',
1274-
'logfire.msg': 'execute mock_async_task',
1397+
),
1398+
(
1399+
'execute {task}',
1400+
{
1401+
'task': 'mock_async_task',
12751402
'logfire.msg_template': 'execute {task}',
1403+
'logfire.msg': 'execute mock_async_task',
1404+
'logfire.json_schema': {'type': 'object', 'properties': {'task': {}}},
12761405
'logfire.span_type': 'span',
1277-
'task': 'mock_async_task',
12781406
},
1279-
'context': {'is_remote': False, 'span_id': 9, 'trace_id': 1},
1280-
'end_time': 7000000000,
1281-
'name': 'execute {task}',
1282-
'parent': {'is_remote': False, 'span_id': 7, 'trace_id': 1},
1283-
'start_time': 6000000000,
1284-
}
1285-
),
1286-
]
1407+
),
1408+
]
1409+
)

0 commit comments

Comments
 (0)