@@ -287,19 +287,38 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:
287287 assert actual .dtype == dtypes .BYTES_DTYPE
288288
289289
290- @pytest .fixture (scope = "session" )
291- def pdf_uris () -> list [str ]:
292- return [
293- "uri: gs://bigframes_blob_test/pdfs/test-protected.pdf" ,
294- "uri: gs://bigframes_blob_test/pdfs/sample-local-pdf.pdf" ,
295- ]
296-
297-
298- @pytest .mark .parametrize ("verbose" , (True , False ))
290+ @pytest .mark .parametrize (
291+ "verbose, expected" ,
292+ [
293+ (
294+ True ,
295+ pd .Series (
296+ [
297+ {"status" : "File has not been decrypted" , "content" : "" },
298+ {
299+ "status" : "" ,
300+ "content" : "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
301+ },
302+ ]
303+ ),
304+ ),
305+ (
306+ False ,
307+ pd .Series (
308+ [
309+ "" ,
310+ "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
311+ ],
312+ name = "pdf" ,
313+ ),
314+ ),
315+ ],
316+ )
299317def test_blob_pdf_extract (
300318 pdf_mm_df : bpd .DataFrame ,
301319 verbose : bool ,
302320 bq_connection : str ,
321+ expected : pd .Series ,
303322):
304323 bigframes .options .experiments .blob = True
305324
@@ -310,57 +329,63 @@ def test_blob_pdf_extract(
310329 .to_pandas ()
311330 )
312331
313- extract_data = [
314- {"status" : "File has not been decrypted" , "content" : "" },
315- {
316- "status" : "" ,
317- "content" : "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
318- },
319- ]
320- if verbose :
321- expected = pd .Series (extract_data )
322- else :
323- content_values = [item ["content" ] for item in extract_data ]
324- expected = pd .Series (content_values )
325-
326332 pd .testing .assert_series_equal (
327333 actual ,
328334 expected ,
329335 check_dtype = False ,
330336 check_index = False ,
331- check_names = False ,
332337 )
333338
334339
335- @pytest .mark .parametrize ("verbose" , (True , False ))
336- def test_blob_pdf_chunk (pdf_mm_df : bpd .DataFrame , verbose : bool , bq_connection : str ):
@pytest.mark.parametrize(
    "verbose, expected",
    [
        # verbose=True: each expected element is a {"status", "content"} record.
        (
            True,
            pd.Series(
                [
                    {"status": "File has not been decrypted", "content": []},
                    {
                        "status": "",
                        "content": [
                            "Sample PDF This is a testing file. Some ",
                            "dummy messages are used for testing ",
                            "purposes. ",
                        ],
                    },
                ]
            ),
        ),
        # verbose=False: expected is the exploded chunk text, with a missing
        # value in the first position.
        (
            False,
            pd.Series(
                [
                    pd.NA,
                    "Sample PDF This is a testing file. Some ",
                    "dummy messages are used for testing ",
                    "purposes. ",
                ],
            ),
        ),
    ],
)
def test_blob_pdf_chunk(
    pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series
):
    """Check ``blob.pdf_chunk`` output against the parametrized expectation.

    The ``pdf`` column of ``pdf_mm_df`` is chunked with ``chunk_size=50`` and
    ``overlap_size=10``, exploded, and materialized with ``to_pandas()`` before
    being compared to ``expected``.

    Args:
        pdf_mm_df: DataFrame fixture providing the ``pdf`` blob column.
        verbose: Forwarded to ``pdf_chunk``; selects which ``expected`` applies.
        bq_connection: Connection name forwarded to ``pdf_chunk``.
        expected: Series the exploded result must equal.
    """
    # The blob accessor is gated behind this experiment flag.
    bigframes.options.experiments.blob = True

    chunked = pdf_mm_df["pdf"].blob.pdf_chunk(
        connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
    )
    actual = chunked.explode().to_pandas()

    # dtype and index are deliberately not compared; values and names are.
    pd.testing.assert_series_equal(
        actual,
        expected,
        check_dtype=False,
        check_index=False,
    )
0 commit comments