diff --git a/CHANGELOG.md b/CHANGELOG.md index a9a618d2..2c26027f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ Unreleased - Added Kashmiri (`kas`). (\#431) - Added Malayalam (`mal`). (\#434) - Added Dhivehi (`div`). (\#437) +- Added Akkadian (`akk`). (\#441) - Added Central Nahuatl (`nhn`). (\#443) - Added Etruscan (`ett`). (\#444) - Added Gujarati (`guj`). (\#445) diff --git a/data/scrape/README.md b/data/scrape/README.md index 7fe38e74..e1c87ce1 100644 --- a/data/scrape/README.md +++ b/data/scrape/README.md @@ -10,6 +10,7 @@ | [TSV](tsv/afr_latn_broad_filtered.tsv) | afr | Afrikaans | Afrikaans | Latin | | True | Broad | True | 1,659 | | [TSV](tsv/afr_latn_narrow.tsv) | afr | Afrikaans | Afrikaans | Latin | | False | Narrow | True | 121 | | [TSV](tsv/ajp_arab_broad.tsv) | ajp | South Levantine Arabic | South Levantine Arabic | Arabic | | False | Broad | False | 155 | +| [TSV](tsv/akk_latn_broad.tsv) | akk | Akkadian | Akkadian | Latin | | False | Broad | True | 199 | | [TSV](tsv/alb_latn_broad.tsv) | alb | Albanian | Albanian | Latin | | False | Broad | True | 1,450 | | [TSV](tsv/alb_latn_narrow.tsv) | alb | Albanian | Albanian | Latin | | False | Narrow | True | 823 | | [TSV](tsv/ale_latn_broad.tsv) | ale | Aleut | Aleut | Latin | | False | Broad | True | 104 | diff --git a/data/scrape/lib/languages.json b/data/scrape/lib/languages.json index a9b3c106..c62f70de 100644 --- a/data/scrape/lib/languages.json +++ b/data/scrape/lib/languages.json @@ -38,6 +38,16 @@ "wiktionary_code": "ain", "casefold": true }, + "akk": { + "iso639_name": "Akkadian", + "wiktionary_name": "Akkadian", + "wiktionary_code": "akk", + "casefold": true, + "script": { + "latn": "Latin", + "xsux": "Cuneiform" + } + }, "alb": { "iso639_name": "Albanian", "wiktionary_name": "Albanian", diff --git a/data/scrape/tsv/akk_latn_broad.tsv b/data/scrape/tsv/akk_latn_broad.tsv new file mode 100644 index 00000000..60a6a951 --- /dev/null +++ b/data/scrape/tsv/akk_latn_broad.tsv @@ -0,0 +1,199 @@ +abnum a b n u m +abum a b u m +adi a d i +agammum a ɡ a m m u m +akalum a k a l u m +aklum a k l u m +akālum a k aː l u m +alaktum a l a k t u m +alpum a l p u m +alākum a l aː k u m +amtum a m t u m +amurdinnum a m u r d i n n u m +ana a n a +annakum a n n a k u m +anāku a n aː k u +aplum a p l u m +aptum a p t u m +aradum a r a d u m +ardu a r d u +ardum a r d u m +ariktum a r i k t u m +arārum a r aː r u m +asum a s u m +atta a t t a +atti a t t i +awātum a w aː t u m +awīltum a w iː l t u m +awīlum a w iː l u m +aššatum a ʃ ʃ a t u m +aḫum a χ u m +aḫûm a χ uː m +aḫātum a χ aː t u m +bardu b a r d u +bardum b a r d u m +birkum b i r k u m +bābilim b aː b i l i m +bābum b aː b u m +bēltum b eː l t u m +bēlum b eː l u m +bītum b iː t u m +būrum b uː r u m +daltum d a l t u m +damāqum d a m aː q u m +danānum d a n aː n u m +dayyānum d a j j aː n u m +dārum d aː r u m +eleppum e l e p p u m +ellum e l l u m +elēpum e l eː p u m +emūqum e m uː q u m +eperum e p e r u m +erbum e r b u m +erebum e r e b u m +ergilum e r ɡ i l u m +erēšum e r eː ʃ u m +erṣetum e r sˤ e t u m +gallûm ɡ a l l uː m +gišimmarum ɡ i ʃ i m m a r u m +idum i d u m +ilkum i l k u m +iltum i l t u m +ilu i l u +ilum i l u m +ilānū i l aː n uː +ilū i l uː +imērum i m eː r u m +in i n +ina i n a +irgilum i r ɡ i l u m +itti i t t i +išarum i ʃ a r u m +išdum i ʃ d u m +ištar i ʃ t a r +ištu i ʃ t u +ištēn i ʃ t eː n +iṣum i sˤ u m +iṣṣūrum i sˤ sˤ uː r u m +jâti j aː t i +kakkabum k a k k a b u m +kalbum k a l b u m +kankum k a n k u m +kanākum k a n aː k u m +kanīkum k a n iː k u m +kappum k a p p u m +kaprum k a p r u m +kaspum k a s p u m +kašādum k a ʃ aː d u m +kirium k i r i u m +kirû k i r uː +kirûm k i r uː m +kunukkum k u n u k k u m +kāsum k aː s u m +lamassum l a m a s s u m +libbum l i b b u m +lûm l uː m +lā l aː +līmum l iː m u m +lītum l iː t u m +marduk m a r d u k +marrum m a r r u m +matqum m a t q u m +matāqum m a t aː q u m +mazzaztum m a z z a z t u m +midrum m i d r u m +musukkannum m u s u k k a n n u m +mutqûm m u t q uː m +mutum m u t u m +muškēnum m u ʃ k eː n u m +mû m uː +mārtum m aː r t u m +mārum m aː r u m +mātum m aː t u m +mēsum m eː s u m +nadānum n a d aː n u m +nagbum n a ɡ b u m +nārum n aː r u m +nīšum n iː ʃ u m +parzillum p a r z i l l u m +parāsum p a r aː s u m +paššūrum p a ʃ ʃ uː r u m +pûm p uː m +pānum p aː n u m +qabûm q a b uː m +qanûm q a n uː m +qaqqadum q a q q a d u m +qaqqarum q a q q a r u m +qarnum q a r n u m +rēšum r eː ʃ u m +sinništum s i n n i ʃ t u m +sīḫum s iː χ u m +u u +ugārum u ɡ aː r u m +ul u l +ummu u m m u +ummum u m m u m +urarṭu u r a r tˤ u +urdu u r d u +urdum u r d u m +urrum u r r u m +uznu u z n u +uznum u z n u m +waklum w a k l u m +walādum w a l aː d u m +wardu w a r d u +wardum w a r d u m +warāqum w a r aː q u m +warḫum w a r χ u m +yâti j aː t i +zikarum z i k a r u m +zikrum z i k r u m +ziqqurratum z i q q u r r a t u m +zubbum z u b b u m +zērum z eː r u m +zību z iː b u +ālikum aː l i k u m +ālum aː l u m +ēkallum eː k a l l u m +ēnum eː n u m +īkum iː k u m +īnum iː n u m +ša ʃ a +šamaš ʃ a m a ʃ +šamaššammum ʃ a m a ʃ ʃ a m m u m +šammum ʃ a m m u m +šamnum ʃ a m n u m +šamû ʃ a m uː +šamšum ʃ a m ʃ u m +šarqum ʃ a r q u m +šarratum ʃ a r r a t u m +šarrum ʃ a r r u m +šarrāqum ʃ a r r aː q u m +šarāqum ʃ a r aː q u m +šattum ʃ a t t u m +šinnum ʃ i n n u m +šinā ʃ i n aː +šumma ʃ u m m a +šumum ʃ u m u m +šārtum ʃ aː r t u m +šēdum ʃ eː d u m +šī ʃ iː +šīmtum ʃ iː m t u m +šīpātum ʃ iː p aː t u m +šū ʃ uː +šūmū ʃ uː m uː +ūmum uː m u m +ḫalāqum χ a l aː q u m +ḫaṣṣinnum χ a sˤ sˤ i n n u m +ḫurāṣum χ u r aː sˤ u m +ṣerretum sˤ e r r e t u m +ṣeḫrum sˤ e χ r u m +ṣeḫērum sˤ e χ eː r u m +ṣuprum sˤ u p r u m +ṣuḫārtum sˤ u χ aː r t u m +ṣuḫārum sˤ u χ aː r u m +ṣābum sˤ aː b u m +ṭabtum tˤ a b t u m +ṭuppum tˤ u p p u m +ṭupšarrum tˤ u p ʃ a r r u m +ṭābum tˤ aː b u m diff --git a/data/scrape/tsv_summary.tsv b/data/scrape/tsv_summary.tsv index 20022cef..4391966e 100644 --- a/data/scrape/tsv_summary.tsv +++ b/data/scrape/tsv_summary.tsv @@ -8,6 +8,7 @@ afr_latn_broad.tsv afr Afrikaans Afrikaans Latin False Broad True 1685 afr_latn_broad_filtered.tsv afr Afrikaans Afrikaans Latin True Broad True 1659 afr_latn_narrow.tsv afr Afrikaans Afrikaans Latin False Narrow True 121 ajp_arab_broad.tsv ajp South Levantine Arabic South Levantine Arabic Arabic False Broad False 155 +akk_latn_broad.tsv akk Akkadian Akkadian Latin False Broad True 199 alb_latn_broad.tsv alb Albanian Albanian Latin False Broad True 1450 alb_latn_narrow.tsv alb Albanian Albanian Latin False Narrow True 823 ale_latn_broad.tsv ale Aleut Aleut Latin False Broad True 104 diff --git a/tests/test_data/test_split.py b/tests/test_data/test_split.py index b31190fa..1aa5a0be 100644 --- a/tests/test_data/test_split.py +++ b/tests/test_data/test_split.py @@ -202,6 +202,15 @@ ("wikipron", False), ], ), + SmokeTestScript( + "Cuneiform", + [ + ("𒄑𒉿𒌆", True), + ("𒁲𒋻", True), + ("جܡ", False), + ("wikipron", False), + ], + ), ]