Skip to content

Commit b1c8bad

Browse files
committed
all testcases are clean
1 parent 5f6fa4a commit b1c8bad

File tree

2 files changed

+42
-53
lines changed

2 files changed

+42
-53
lines changed

bigframes/blob/_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,10 +425,10 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
425425
if page_extract_text:
426426
all_text += page_extract_text
427427

428-
result_dict = {"status": None, "content": all_text}
428+
result_dict = {"status": "", "content": all_text}
429429

430430
except Exception as e:
431-
result_dict = {"status": str(e), "content": None}
431+
result_dict = {"status": str(e), "content": ""}
432432

433433
result_json = json.dumps(result_dict)
434434
return result_json

tests/system/large/blob/test_function.py

Lines changed: 40 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -295,83 +295,72 @@ def pdf_uris() -> list[str]:
295295
]
296296

297297

298+
@pytest.mark.parametrize("verbose", (True, False))
298299
def test_blob_pdf_extract(
299300
pdf_mm_df: bpd.DataFrame,
301+
verbose: bool,
300302
bq_connection: str,
301303
):
302304
bigframes.options.experiments.blob = True
303305

304-
actual_exploded = (
306+
actual = (
305307
pdf_mm_df["pdf"]
306-
.blob.pdf_extract(connection=bq_connection, verbose=True)
307-
.struct.explode()
308+
.blob.pdf_extract(connection=bq_connection, verbose=verbose)
309+
.explode()
308310
.to_pandas()
309311
)
310-
# content = [
311-
# None,
312-
# '"Sample PDF This is a testing file. Some dummy messages are used for testing purposes. "',
313-
# ]
314-
expected_df = pd.DataFrame(
315-
[
316-
{"status": '"File has not been decrypted"', "content": None},
317-
{
318-
"status": None,
319-
"content": '"Sample PDF This is a testing file. Some dummy messages are used for testing purposes. "',
320-
},
321-
]
322-
)
323-
actual = pd.DataFrame(
312+
313+
extract_data = [
314+
{"status": "File has not been decrypted", "content": ""},
324315
{
325-
"values": actual_exploded.to_dict("records"),
326-
}
327-
)
316+
"status": "",
317+
"content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
318+
},
319+
]
320+
if verbose:
321+
expected = pd.Series(extract_data)
322+
else:
323+
content_values = [item["content"] for item in extract_data]
324+
expected = pd.Series(content_values)
328325

329-
pd.testing.assert_frame_equal(
326+
pd.testing.assert_series_equal(
330327
actual,
331-
expected_df,
328+
expected,
332329
check_dtype=False,
333-
check_index_type=False,
330+
check_index=False,
331+
check_names=False,
334332
)
335333

336334

337-
def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, bq_connection: str):
335+
@pytest.mark.parametrize("verbose", (True, False))
336+
def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str):
338337
bigframes.options.experiments.blob = True
339338

340-
actual_exploded = (
341-
pdf_mm_df["pdf"]
342-
.blob.pdf_chunk(connection=bq_connection, chunk_size=50, overlap_size=10)
343-
.struct.explode()
344-
.to_pandas()
339+
actual = pdf_mm_df["pdf"].blob.pdf_chunk(
340+
connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
345341
)
346-
extract_data = [
347-
{"status": '"File has not been decrypted"', "content": []},
342+
343+
chunk_data = [
344+
{"status": "File has not been decrypted", "content": []},
348345
{
349-
"status": '""',
346+
"status": "",
350347
"content": [
351348
"Sample PDF This is a testing file. Some ",
352349
"dummy messages are used for testing ",
353350
"purposes. ",
354351
],
355352
},
356353
]
357-
expected_df = pd.DataFrame(
358-
{
359-
"pdf": pdf_uris,
360-
"authorizer": [bq_connection.casefold(), bq_connection.casefold()],
361-
"values": extract_data,
362-
}
363-
)
364-
actual = pd.DataFrame(
365-
{
366-
"pdf": pdf_uris,
367-
"authorizer": [bq_connection.casefold(), bq_connection.casefold()],
368-
"values": actual_exploded.to_dict("records"),
369-
}
370-
)
371-
372-
pd.testing.assert_frame_equal(
373-
actual,
374-
expected_df,
354+
if verbose:
355+
actual = actual.explode()
356+
expected = pd.Series(chunk_data)
357+
else:
358+
content_values = [item["content"] for item in chunk_data]
359+
expected = pd.Series(content_values)
360+
pd.testing.assert_series_equal(
361+
actual.to_pandas(),
362+
expected,
375363
check_dtype=False,
376-
check_index_type=False,
364+
check_index=False,
365+
check_names=False,
377366
)

0 commit comments

Comments
 (0)