@@ -295,83 +295,72 @@ def pdf_uris() -> list[str]:
295295 ]
296296
297297
@pytest.mark.parametrize("verbose", (True, False))
def test_blob_pdf_extract(
    pdf_mm_df: bpd.DataFrame,
    verbose: bool,
    bq_connection: str,
):
    """Check ``blob.pdf_extract`` results in verbose and non-verbose modes.

    Args:
        pdf_mm_df: Fixture DataFrame whose ``"pdf"`` column holds the PDF blobs.
        verbose: Forwarded to ``pdf_extract``. When true, the expected values
            are ``{"status", "content"}`` records; otherwise they are the bare
            ``content`` strings.
        bq_connection: Connection name forwarded to ``pdf_extract``.
    """
    bigframes.options.experiments.blob = True

    actual = (
        pdf_mm_df["pdf"]
        .blob.pdf_extract(connection=bq_connection, verbose=verbose)
        .explode()
        .to_pandas()
    )

    # One record per input PDF: the first is expected to report a status
    # message with empty content, the second empty status with extracted text.
    extract_data = [
        {"status": "File has not been decrypted", "content": ""},
        {
            "status": "",
            "content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
        },
    ]
    if verbose:
        expected = pd.Series(extract_data)
    else:
        # Non-verbose mode is compared against the content strings only.
        content_values = [item["content"] for item in extract_data]
        expected = pd.Series(content_values)

    # Compare values only: dtype, index and series name are not checked.
    pd.testing.assert_series_equal(
        actual,
        expected,
        check_dtype=False,
        check_index=False,
        check_names=False,
    )
335333
336334
@pytest.mark.parametrize("verbose", (True, False))
def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str):
    """Check ``blob.pdf_chunk`` results in verbose and non-verbose modes.

    Args:
        pdf_mm_df: Fixture DataFrame whose ``"pdf"`` column holds the PDF blobs.
        verbose: Forwarded to ``pdf_chunk``. When true, the expected values
            are ``{"status", "content"}`` records; otherwise they are the bare
            ``content`` chunk lists.
        bq_connection: Connection name forwarded to ``pdf_chunk``.
    """
    bigframes.options.experiments.blob = True

    actual = pdf_mm_df["pdf"].blob.pdf_chunk(
        connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
    )

    # One record per input PDF: the first is expected to report a status
    # message with no chunks, the second empty status with three text chunks.
    chunk_data = [
        {"status": "File has not been decrypted", "content": []},
        {
            "status": "",
            "content": [
                "Sample PDF This is a testing file. Some ",
                "dummy messages are used for testing ",
                "purposes. ",
            ],
        },
    ]
    if verbose:
        # Only the verbose result is exploded before comparison.
        actual = actual.explode()
        expected = pd.Series(chunk_data)
    else:
        # Non-verbose mode is compared against the chunk lists only.
        content_values = [item["content"] for item in chunk_data]
        expected = pd.Series(content_values)

    # Compare values only: dtype, index and series name are not checked.
    pd.testing.assert_series_equal(
        actual.to_pandas(),
        expected,
        check_dtype=False,
        check_index=False,
        check_names=False,
    )
0 commit comments