     DataCollatorWithFlattening,
     DataCollatorWithPadding,
     default_data_collator,
-    is_torch_available,
     is_tf_available,
+    is_torch_available,
     set_seed,
 )
 from transformers.testing_utils import require_torch
@@ -531,9 +531,7 @@ def test_data_collator_for_whole_word_mask(self):
 
         input_tokens = [f"token_{i} " for i in range(8)]
         tokenizer.add_tokens(input_tokens)
-        features = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        features = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
 
         data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="pt")
 
@@ -543,8 +541,7 @@ def test_data_collator_for_whole_word_mask(self):
 
         # Features can already be tensors
         features = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("np") 
-            for _ in range(2)
+            tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("np") for _ in range(2)
         ]
         batch = data_collator(features)
         self.assertEqual(batch["input_ids"].shape, (2, 10))
@@ -553,7 +550,7 @@ def test_data_collator_for_whole_word_mask(self):
         if is_torch_available():
             # Features can already be tensors
             features = [
-                tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("pt") 
+                tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("pt")
                 for _ in range(2)
             ]
             data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="pt")
@@ -563,9 +560,10 @@ def test_data_collator_for_whole_word_mask(self):
 
         if is_tf_available():
             import tensorflow as tf
+
             # Features can already be tensors
             features = [
-                tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("tf") 
+                tokenizer(" ".join(input_tokens), return_offsets_mapping=True).convert_to_tensors("tf")
                 for _ in range(2)
             ]
             data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="tf")
@@ -578,9 +576,7 @@ def test_data_collator_for_whole_word_mask_with_seed(self):
 
         input_tokens = [f"token_{i} " for i in range(998)]
         tokenizer.add_tokens(input_tokens)
-        features = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        features = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
 
         # check if seed is respected between two different DataCollatorForWholeWordMask instances
         data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="np")
@@ -598,9 +594,7 @@ def test_data_collator_for_whole_word_mask_with_seed(self):
 
         # check if seed is respected in multiple workers situation
         if is_torch_available():
-            features = [
-                tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(10)
-            ]
+            features = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(10)]
             dataloader = torch.utils.data.DataLoader(
                 features,
                 batch_size=2,
@@ -975,17 +969,13 @@ def test_whole_world_masking_collator_immutability(self):
 
         input_tokens = [f"token_{i} " for i in range(8)]
         tokenizer.add_tokens(input_tokens)
-        original_data = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        original_data = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
         for feature in original_data:
-            feature['labels'] = (1,)
+            feature["labels"] = (1,)
 
-        batch_data = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        batch_data = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
         for feature in batch_data:
-            feature['labels'] = (1,)
+            feature["labels"] = (1,)
 
         whole_word_masking_collator = DataCollatorForWholeWordMask(tokenizer)
 
@@ -1450,9 +1440,7 @@ def test_data_collator_for_whole_word_mask(self):
 
         input_tokens = [f"token_{i} " for i in range(8)]
         tokenizer.add_tokens(input_tokens)
-        features = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        features = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
 
         batch = data_collator(features)
         self.assertEqual(batch["input_ids"].shape, (2, 10))
@@ -1471,9 +1459,7 @@ def test_data_collator_for_whole_word_mask_with_seed(self):
 
         input_tokens = [f"token_{i} " for i in range(998)]
         tokenizer.add_tokens(input_tokens)
-        features = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        features = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
 
         # check if seed is respected between two different DataCollatorForWholeWordMask instances
         data_collator = DataCollatorForWholeWordMask(tokenizer, seed=42, return_tensors="np")
@@ -1816,17 +1802,13 @@ def test_whole_world_masking_collator_immutability(self):
 
         input_tokens = [f"token_{i} " for i in range(8)]
        tokenizer.add_tokens(input_tokens)
-        original_data = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        original_data = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
         for feature in original_data:
-            feature['labels'] = (1,)
+            feature["labels"] = (1,)
 
-        batch_data = [
-            tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)
-        ]
+        batch_data = [tokenizer(" ".join(input_tokens), return_offsets_mapping=True) for _ in range(2)]
         for feature in batch_data:
-            feature['labels'] = (1,)
+            feature["labels"] = (1,)
 
         whole_word_masking_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="np")
 
@@ -1902,3 +1884,98 @@ def test_sentence_order_prediction_collator_immutability(self):
         self._validate_original_data_against_collated_data(
             collator=sop_collator, original_data=features_original, batch_data=features_batch
         )
+
+
+class DataCollatorForLanguageModelingUnitTest(unittest.TestCase):
+    def test__calc_word_ids_and_prob_mask(self):
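+        # Character spans per token, as returned by return_offsets_mapping=True;
+        # fast tokenizers report (0, 0) for special/padding positions.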
+        offsets = np.array(
+            [
+                [(0, 0), (0, 3), (3, 4), (5, 6), (6, 7), (8, 9)],
+                [(0, 0), (0, 3), (3, 4), (5, 6), (6, 7), (0, 0)],
+                [(0, 0), (0, 3), (3, 4), (0, 0), (6, 7), (0, 0)],
+                [(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)],
+                [(1, 1), (2, 2), (3, 4), (5, 6), (7, 8), (9, 10)],
+                [(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0)],
+            ]
+        )
+
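+        # 1 flags special tokens, which are excluded from word grouping.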
+        special_tokens_mask = np.array(
+            [
+                [1, 0, 0, 0, 0, 0],
+                [1, 0, 0, 0, 0, 1],
+                [1, 0, 0, 1, 0, 1],
+                [0, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0],
+                [1, 1, 1, 1, 1, 1],
+            ]
+        )
+
+        output_word_ids, output_prob_mask = DataCollatorForLanguageModeling._calc_word_ids_and_prob_mask(
+            offsets, special_tokens_mask
+        )
+
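+        # Special tokens should get word id -1; sub-tokens whose spans touch
+        # (previous end == next start) should share a word id.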
+        expected_word_ids = np.array(
+            [
+                [-1, 1, 1, 2, 2, 3],
+                [-1, 1, 1, 2, 2, -1],
+                [-1, 1, 1, -1, 2, -1],
+                [1, 1, 1, 1, 1, 1],
+                [1, 2, 3, 4, 5, 6],
+                [-1, -1, -1, -1, -1, -1],
+            ]
+        )
+
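+        # prob_mask should be 1 wherever a token cannot be picked for masking:
+        # special tokens and non-initial sub-tokens of a word.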
+        expected_prob_mask = np.array(
+            [
+                [1, 0, 1, 0, 1, 0],
+                [1, 0, 1, 0, 1, 1],
+                [1, 0, 1, 1, 0, 1],
+                [0, 1, 1, 1, 1, 1],
+                [0, 0, 0, 0, 0, 0],
+                [1, 1, 1, 1, 1, 1],
+            ]
+        )
+
+        np.testing.assert_array_equal(output_word_ids, expected_word_ids)
+        np.testing.assert_array_equal(output_prob_mask, expected_prob_mask)
+
+    def test__whole_word_mask(self):
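+        # Expanding a token-level selection should mask every sub-token of each
+        # selected word, while -1 (special) positions stay untouched.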
+        word_ids = np.array(
+            [
+                [-1, 1, 1, 2, 2, 3],
+                [-1, 1, 1, 2, 2, -1],
+                [-1, 1, 1, -1, 2, -1],
+                [1, 1, 1, 1, 1, 1],
+                [1, 2, 3, 4, 5, 6],
+                [1, 2, 3, 4, 5, 6],
+                [-1, -1, -1, -1, -1, -1],
+            ]
+        )
+
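+        # Token-level selection of candidate positions to expand.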
+        mask = np.array(
+            [
+                [0, 1, 0, 0, 0, 0],
+                [0, 1, 0, 1, 0, 0],
+                [0, 0, 0, 0, 1, 0],
+                [1, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0],
+                [0, 1, 0, 1, 0, 1],
+                [0, 0, 0, 0, 0, 0],
+            ]
+        ).astype(bool)
+
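+        # Rows with no selection, or with only -1 word ids, should stay all False.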
+        output_mask = DataCollatorForLanguageModeling._whole_word_mask(word_ids, mask)
+
+        expected_mask = np.array(
+            [
+                [0, 1, 1, 0, 0, 0],
+                [0, 1, 1, 1, 1, 0],
+                [0, 0, 0, 0, 1, 0],
+                [1, 1, 1, 1, 1, 1],
+                [0, 0, 0, 0, 0, 0],
+                [0, 1, 0, 1, 0, 1],
+                [0, 0, 0, 0, 0, 0],
+            ]
+        ).astype(bool)
+
+        np.testing.assert_array_equal(output_mask, expected_mask)