Skip to content

Commit

Permalink
pona: o kepeken ala ilo pi suli open; o pali e ilo pi sona suli lon poki Config
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdan3 committed Oct 16, 2024
1 parent 38a78d1 commit ec8de13
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 14 deletions.
34 changes: 20 additions & 14 deletions src/sonatoki/Configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@
from sonatoki.Filters import (
Or,
And,
Len,
Not,
Filter,
PuName,
Numeric,
Syllabic,
NimiUCSUR,
Alphabetic,
NimiKuLili,
NimiKuSuli,
ProperName,
Punctuation,
LongSyllabic,
Miscellaneous,
Expand All @@ -29,7 +32,7 @@
NimiLinkuUncommon,
FalsePosAlphabetic,
)
from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
from sonatoki.Preprocessors import (
Expand Down Expand Up @@ -62,8 +65,8 @@
"we", # 1st person plural, english
"wi", # wii and discussions of syllables
"sole", # singular, of shoe
"omen", # ominous
# unexplored candidates for removal
# "omen", # ominous
# "papa", # father
# "lo", # "lo" and "loo"
# "ewe", # sheep
Expand Down Expand Up @@ -99,11 +102,11 @@ class IloConfig(TypedDict):
"cleaners": [ConsecutiveDuplicates],
"ignoring_filters": [Numeric, Punctuation],
"scoring_filters": [
Or(NimiLinkuByUsage(30), NimiUCSUR),
And(LongSyllabic, Not(FalsePosSyllabic)),
Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
# NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
LongProperName,
And(LongAlphabetic, Not(FalsePosAlphabetic)),
Len(ProperName, min=2, max=24),
Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
],
"scorer": SoftScaling,
"passing_score": 0.8,
Expand All @@ -114,15 +117,18 @@ class IloConfig(TypedDict):
"cleaners": [ConsecutiveDuplicates],
"ignoring_filters": [Numeric, Punctuation],
"scoring_filters": [
Or(
# awkward but efficient syntax
NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
NimiUCSUR,
Miscellaneous,
Len(
Or(
# awkward but efficient syntax
NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
NimiUCSUR,
Miscellaneous,
),
max=19,
),
And(LongSyllabic, Not(FalsePosSyllabic)),
LongProperName,
And(LongAlphabetic, Not(FalsePosAlphabetic)),
Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
Len(ProperName, min=2, max=24),
Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
],
"scorer": SoftScaling,
"passing_score": 0.8,
Expand Down
2 changes: 2 additions & 0 deletions tests/test_ilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def corpus_ilo() -> Ilo:
"ni li sona kiwen",
"nimi namako li toki e ale",
"mi open mute a", # mostly eng words
"mi pali ilo to",
]

IGNORABLES = [
Expand Down Expand Up @@ -201,6 +202,7 @@ def corpus_ilo() -> Ilo:
"poan",
"mtue",
"mi nasa B^)", # emoticon
"musi :P", # emoticon
"lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
"😃⃢👍", # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
]
Expand Down

0 comments on commit ec8de13

Please sign in to comment.