|
7 | 7 | from typing import Any |
8 | 8 |
|
9 | 9 | import pytest |
10 | | -from dirty_equals import HasRepr, IsNumber, IsPartialDict |
| 10 | +from dirty_equals import HasRepr, IsNumber |
11 | 11 | from inline_snapshot import snapshot |
12 | 12 | from pydantic import BaseModel, TypeAdapter |
13 | 13 |
|
@@ -1172,115 +1172,238 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: |
1172 | 1172 |
|
1173 | 1173 | await example_dataset.evaluate(mock_async_task) |
1174 | 1174 |
|
1175 | | - spans = capfire.exporter.exported_spans_as_dict() |
| 1175 | + spans = capfire.exporter.exported_spans_as_dict(parse_json_attributes=True) |
1176 | 1176 | spans.sort(key=lambda s: s['start_time']) |
1177 | | - assert spans == [ |
1178 | | - { |
1179 | | - 'attributes': { |
1180 | | - 'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":5.0}', |
1181 | | - 'cases': '[{"name":"case1","inputs":{"query":"What is ' |
1182 | | - '2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":6.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003"},{"name":"case2","inputs":{"query":"What ' |
1183 | | - 'is the capital of ' |
1184 | | - 'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":4.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007"}]', |
1185 | | - 'code.filepath': 'test_dataset.py', |
1186 | | - 'code.function': 'test_evaluate_async_logfire', |
1187 | | - 'code.lineno': 123, |
1188 | | - 'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"averages":{"type":"object"}}}', |
1189 | | - 'logfire.msg': 'evaluate mock_async_task', |
1190 | | - 'logfire.msg_template': 'evaluate {name}', |
1191 | | - 'logfire.span_type': 'span', |
1192 | | - 'name': 'mock_async_task', |
1193 | | - }, |
1194 | | - 'context': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
1195 | | - 'end_time': 10000000000, |
1196 | | - 'name': 'evaluate {name}', |
1197 | | - 'parent': None, |
1198 | | - 'start_time': 1000000000, |
1199 | | - }, |
1200 | | - IsPartialDict( |
1201 | | - { |
1202 | | - 'attributes': { |
1203 | | - 'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
1204 | | - 'attributes': '{}', |
| 1177 | + |
| 1178 | + for span in spans: |
| 1179 | + # These may or may not be present and may have weird values due to things running in async |
| 1180 | + span['attributes'].pop('code.filepath', None) |
| 1181 | + span['attributes'].pop('code.function', None) |
| 1182 | + span['attributes'].pop('code.lineno', None) |
| 1183 | + |
| 1184 | + assert [(span['name'], span['attributes']) for span in spans] == snapshot( |
| 1185 | + [ |
| 1186 | + ( |
| 1187 | + 'evaluate {name}', |
| 1188 | + { |
| 1189 | + 'name': 'mock_async_task', |
| 1190 | + 'logfire.msg_template': 'evaluate {name}', |
| 1191 | + 'logfire.msg': 'evaluate mock_async_task', |
| 1192 | + 'logfire.span_type': 'span', |
| 1193 | + 'cases': [ |
| 1194 | + { |
| 1195 | + 'name': 'case1', |
| 1196 | + 'inputs': {'query': 'What is 2+2?'}, |
| 1197 | + 'metadata': {'difficulty': 'easy', 'category': 'general'}, |
| 1198 | + 'expected_output': {'answer': '4', 'confidence': 1.0}, |
| 1199 | + 'output': {'answer': '4', 'confidence': 1.0}, |
| 1200 | + 'metrics': {}, |
| 1201 | + 'attributes': {}, |
| 1202 | + 'scores': { |
| 1203 | + 'confidence': { |
| 1204 | + 'name': 'confidence', |
| 1205 | + 'value': 1.0, |
| 1206 | + 'reason': None, |
| 1207 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1208 | + } |
| 1209 | + }, |
| 1210 | + 'labels': {}, |
| 1211 | + 'assertions': { |
| 1212 | + 'correct': { |
| 1213 | + 'name': 'correct', |
| 1214 | + 'value': True, |
| 1215 | + 'reason': None, |
| 1216 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1217 | + } |
| 1218 | + }, |
| 1219 | + 'task_duration': 1.0, |
| 1220 | + 'total_duration': 6.0, |
| 1221 | + 'trace_id': '00000000000000000000000000000001', |
| 1222 | + 'span_id': '0000000000000003', |
| 1223 | + }, |
| 1224 | + { |
| 1225 | + 'name': 'case2', |
| 1226 | + 'inputs': {'query': 'What is the capital of France?'}, |
| 1227 | + 'metadata': {'difficulty': 'medium', 'category': 'geography'}, |
| 1228 | + 'expected_output': {'answer': 'Paris', 'confidence': 1.0}, |
| 1229 | + 'output': {'answer': 'Paris', 'confidence': 1.0}, |
| 1230 | + 'metrics': {}, |
| 1231 | + 'attributes': {}, |
| 1232 | + 'scores': { |
| 1233 | + 'confidence': { |
| 1234 | + 'name': 'confidence', |
| 1235 | + 'value': 1.0, |
| 1236 | + 'reason': None, |
| 1237 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1238 | + } |
| 1239 | + }, |
| 1240 | + 'labels': {}, |
| 1241 | + 'assertions': { |
| 1242 | + 'correct': { |
| 1243 | + 'name': 'correct', |
| 1244 | + 'value': True, |
| 1245 | + 'reason': None, |
| 1246 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1247 | + } |
| 1248 | + }, |
| 1249 | + 'task_duration': 1.0, |
| 1250 | + 'total_duration': 4.0, |
| 1251 | + 'trace_id': '00000000000000000000000000000001', |
| 1252 | + 'span_id': '0000000000000007', |
| 1253 | + }, |
| 1254 | + ], |
| 1255 | + 'averages': { |
| 1256 | + 'name': 'Averages', |
| 1257 | + 'scores': {'confidence': 1.0}, |
| 1258 | + 'labels': {}, |
| 1259 | + 'metrics': {}, |
| 1260 | + 'assertions': 1.0, |
| 1261 | + 'task_duration': 1.0, |
| 1262 | + 'total_duration': 5.0, |
| 1263 | + }, |
| 1264 | + 'logfire.json_schema': { |
| 1265 | + 'type': 'object', |
| 1266 | + 'properties': {'name': {}, 'cases': {'type': 'array'}, 'averages': {'type': 'object'}}, |
| 1267 | + }, |
| 1268 | + }, |
| 1269 | + ), |
| 1270 | + ( |
| 1271 | + 'case: {case_name}', |
| 1272 | + { |
| 1273 | + 'task_name': 'mock_async_task', |
1205 | 1274 | 'case_name': 'case1', |
1206 | | - 'expected_output': '{"answer":"4","confidence":1.0}', |
1207 | | - 'inputs': '{"query":"What is 2+2?"}', |
1208 | | - 'labels': '{}', |
1209 | | - 'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}', |
1210 | | - 'logfire.msg': 'case: case1', |
| 1275 | + 'inputs': {'query': 'What is 2+2?'}, |
| 1276 | + 'metadata': {'difficulty': 'easy', 'category': 'general'}, |
| 1277 | + 'expected_output': {'answer': '4', 'confidence': 1.0}, |
1211 | 1278 | 'logfire.msg_template': 'case: {case_name}', |
| 1279 | + 'logfire.msg': 'case: case1', |
1212 | 1280 | 'logfire.span_type': 'span', |
1213 | | - 'metadata': '{"difficulty":"easy","category":"general"}', |
1214 | | - 'metrics': '{}', |
1215 | | - 'output': '{"answer":"4","confidence":1.0}', |
1216 | | - 'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1281 | + 'output': {'answer': '4', 'confidence': 1.0}, |
1217 | 1282 | 'task_duration': 1.0, |
1218 | | - 'task_name': 'mock_async_task', |
| 1283 | + 'metrics': {}, |
| 1284 | + 'attributes': {}, |
| 1285 | + 'assertions': { |
| 1286 | + 'correct': { |
| 1287 | + 'name': 'correct', |
| 1288 | + 'value': True, |
| 1289 | + 'reason': None, |
| 1290 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1291 | + } |
| 1292 | + }, |
| 1293 | + 'scores': { |
| 1294 | + 'confidence': { |
| 1295 | + 'name': 'confidence', |
| 1296 | + 'value': 1.0, |
| 1297 | + 'reason': None, |
| 1298 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1299 | + } |
| 1300 | + }, |
| 1301 | + 'labels': {}, |
| 1302 | + 'logfire.json_schema': { |
| 1303 | + 'type': 'object', |
| 1304 | + 'properties': { |
| 1305 | + 'task_name': {}, |
| 1306 | + 'case_name': {}, |
| 1307 | + 'inputs': {'type': 'object', 'title': 'TaskInput', 'x-python-datatype': 'PydanticModel'}, |
| 1308 | + 'metadata': { |
| 1309 | + 'type': 'object', |
| 1310 | + 'title': 'TaskMetadata', |
| 1311 | + 'x-python-datatype': 'PydanticModel', |
| 1312 | + }, |
| 1313 | + 'expected_output': { |
| 1314 | + 'type': 'object', |
| 1315 | + 'title': 'TaskOutput', |
| 1316 | + 'x-python-datatype': 'PydanticModel', |
| 1317 | + }, |
| 1318 | + 'output': {'type': 'object', 'title': 'TaskOutput', 'x-python-datatype': 'PydanticModel'}, |
| 1319 | + 'task_duration': {}, |
| 1320 | + 'metrics': {'type': 'object'}, |
| 1321 | + 'attributes': {'type': 'object'}, |
| 1322 | + 'assertions': {'type': 'object'}, |
| 1323 | + 'scores': {'type': 'object'}, |
| 1324 | + 'labels': {'type': 'object'}, |
| 1325 | + }, |
| 1326 | + }, |
1219 | 1327 | }, |
1220 | | - 'context': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, |
1221 | | - 'end_time': 8000000000, |
1222 | | - 'name': 'case: {case_name}', |
1223 | | - 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
1224 | | - 'start_time': 2000000000, |
1225 | | - } |
1226 | | - ), |
1227 | | - IsPartialDict( |
1228 | | - { |
1229 | | - 'attributes': { |
1230 | | - 'logfire.json_schema': '{"type":"object","properties":{"task":{}}}', |
1231 | | - 'logfire.msg': 'execute mock_async_task', |
| 1328 | + ), |
| 1329 | + ( |
| 1330 | + 'execute {task}', |
| 1331 | + { |
| 1332 | + 'task': 'mock_async_task', |
1232 | 1333 | 'logfire.msg_template': 'execute {task}', |
| 1334 | + 'logfire.msg': 'execute mock_async_task', |
| 1335 | + 'logfire.json_schema': {'type': 'object', 'properties': {'task': {}}}, |
1233 | 1336 | 'logfire.span_type': 'span', |
1234 | | - 'task': 'mock_async_task', |
1235 | 1337 | }, |
1236 | | - 'context': {'is_remote': False, 'span_id': 5, 'trace_id': 1}, |
1237 | | - 'end_time': 4000000000, |
1238 | | - 'name': 'execute {task}', |
1239 | | - 'parent': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, |
1240 | | - 'start_time': 3000000000, |
1241 | | - } |
1242 | | - ), |
1243 | | - IsPartialDict( |
1244 | | - { |
1245 | | - 'attributes': { |
1246 | | - 'assertions': '{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
1247 | | - 'attributes': '{}', |
| 1338 | + ), |
| 1339 | + ( |
| 1340 | + 'case: {case_name}', |
| 1341 | + { |
| 1342 | + 'task_name': 'mock_async_task', |
1248 | 1343 | 'case_name': 'case2', |
1249 | | - 'expected_output': '{"answer":"Paris","confidence":1.0}', |
1250 | | - 'inputs': '{"query":"What is the capital of France?"}', |
1251 | | - 'labels': '{}', |
1252 | | - 'logfire.json_schema': '{"type":"object","properties":{"task_name":{},"case_name":{},"inputs":{"type":"object","title":"TaskInput","x-python-datatype":"PydanticModel"},"metadata":{"type":"object","title":"TaskMetadata","x-python-datatype":"PydanticModel"},"expected_output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"output":{"type":"object","title":"TaskOutput","x-python-datatype":"PydanticModel"},"task_duration":{},"metrics":{"type":"object"},"attributes":{"type":"object"},"assertions":{"type":"object"},"scores":{"type":"object"},"labels":{"type":"object"}}}', |
1253 | | - 'logfire.msg': 'case: case2', |
| 1344 | + 'inputs': {'query': 'What is the capital of France?'}, |
| 1345 | + 'metadata': {'difficulty': 'medium', 'category': 'geography'}, |
| 1346 | + 'expected_output': {'answer': 'Paris', 'confidence': 1.0}, |
1254 | 1347 | 'logfire.msg_template': 'case: {case_name}', |
| 1348 | + 'logfire.msg': 'case: case2', |
1255 | 1349 | 'logfire.span_type': 'span', |
1256 | | - 'metadata': '{"difficulty":"medium","category":"geography"}', |
1257 | | - 'metrics': '{}', |
1258 | | - 'output': '{"answer":"Paris","confidence":1.0}', |
1259 | | - 'scores': '{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}}', |
| 1350 | + 'output': {'answer': 'Paris', 'confidence': 1.0}, |
1260 | 1351 | 'task_duration': 1.0, |
1261 | | - 'task_name': 'mock_async_task', |
| 1352 | + 'metrics': {}, |
| 1353 | + 'attributes': {}, |
| 1354 | + 'assertions': { |
| 1355 | + 'correct': { |
| 1356 | + 'name': 'correct', |
| 1357 | + 'value': True, |
| 1358 | + 'reason': None, |
| 1359 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1360 | + } |
| 1361 | + }, |
| 1362 | + 'scores': { |
| 1363 | + 'confidence': { |
| 1364 | + 'name': 'confidence', |
| 1365 | + 'value': 1.0, |
| 1366 | + 'reason': None, |
| 1367 | + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, |
| 1368 | + } |
| 1369 | + }, |
| 1370 | + 'labels': {}, |
| 1371 | + 'logfire.json_schema': { |
| 1372 | + 'type': 'object', |
| 1373 | + 'properties': { |
| 1374 | + 'task_name': {}, |
| 1375 | + 'case_name': {}, |
| 1376 | + 'inputs': {'type': 'object', 'title': 'TaskInput', 'x-python-datatype': 'PydanticModel'}, |
| 1377 | + 'metadata': { |
| 1378 | + 'type': 'object', |
| 1379 | + 'title': 'TaskMetadata', |
| 1380 | + 'x-python-datatype': 'PydanticModel', |
| 1381 | + }, |
| 1382 | + 'expected_output': { |
| 1383 | + 'type': 'object', |
| 1384 | + 'title': 'TaskOutput', |
| 1385 | + 'x-python-datatype': 'PydanticModel', |
| 1386 | + }, |
| 1387 | + 'output': {'type': 'object', 'title': 'TaskOutput', 'x-python-datatype': 'PydanticModel'}, |
| 1388 | + 'task_duration': {}, |
| 1389 | + 'metrics': {'type': 'object'}, |
| 1390 | + 'attributes': {'type': 'object'}, |
| 1391 | + 'assertions': {'type': 'object'}, |
| 1392 | + 'scores': {'type': 'object'}, |
| 1393 | + 'labels': {'type': 'object'}, |
| 1394 | + }, |
| 1395 | + }, |
1262 | 1396 | }, |
1263 | | - 'context': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, |
1264 | | - 'end_time': 9000000000, |
1265 | | - 'name': 'case: {case_name}', |
1266 | | - 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, |
1267 | | - 'start_time': 5000000000, |
1268 | | - } |
1269 | | - ), |
1270 | | - IsPartialDict( |
1271 | | - { |
1272 | | - 'attributes': { |
1273 | | - 'logfire.json_schema': '{"type":"object","properties":{"task":{}}}', |
1274 | | - 'logfire.msg': 'execute mock_async_task', |
| 1397 | + ), |
| 1398 | + ( |
| 1399 | + 'execute {task}', |
| 1400 | + { |
| 1401 | + 'task': 'mock_async_task', |
1275 | 1402 | 'logfire.msg_template': 'execute {task}', |
| 1403 | + 'logfire.msg': 'execute mock_async_task', |
| 1404 | + 'logfire.json_schema': {'type': 'object', 'properties': {'task': {}}}, |
1276 | 1405 | 'logfire.span_type': 'span', |
1277 | | - 'task': 'mock_async_task', |
1278 | 1406 | }, |
1279 | | - 'context': {'is_remote': False, 'span_id': 9, 'trace_id': 1}, |
1280 | | - 'end_time': 7000000000, |
1281 | | - 'name': 'execute {task}', |
1282 | | - 'parent': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, |
1283 | | - 'start_time': 6000000000, |
1284 | | - } |
1285 | | - ), |
1286 | | - ] |
| 1407 | + ), |
| 1408 | + ] |
| 1409 | + ) |
0 commit comments