diff --git a/README.md b/README.md index 3900d37..0219c52 100644 --- a/README.md +++ b/README.md @@ -141,12 +141,12 @@ flowchart TD direction LR H1[Consonant added before second vowel] ~~~ H2["'X' added after first vowel"] end - subgraph c2 [Two consonants between vowels] + subgraph c2 ["Two consonants between vowels (5)"] direction LR H3["Consonants are consonant cluster (4)"] -->|Yes| H4[Cluster added before second vowel] H3 -->|No| H5[First consonant added after first vowel, \n second consonant added before second vowel] end - subgraph c3 [Three-plus consonants between vowels] + subgraph c3 ["Three-plus consonants between vowels (5)"] direction LR H6["First two consonants are consonant cluster"] -->|Yes| H7[Cluster added after first vowel, \n remaining consosnants added before second vowel] H6 -->|No| H8[First consonant added after first vowel, \n remaining consonants added before second vowel] @@ -183,11 +183,12 @@ flowchart TD - "ch", "ph", "th", and "rh" - "gn" - "qu" and "gu" (when "u" serving as semivowel) - - "nc" - - "mp" - - "sc" + - "sc" and "st" - "p", "b", "t", "d", "c", "f", or "g" + "l" - "p", "b", "t", "d", "c", "f", or "g" + "r" + - "str" + +*Note (5)*: Nasalized consonants ("m" and "n") "attach" to the previous vowel when they begin sequences of two or more consonants. The remaining consonants are then treated as if they were the only consonants between vowels. For example, if the three-consonant sequence "mpr" occurs between two vowel groups, the "m" attaches to the preceding vowel group, and the remaining consonants "pr" are treated according to the logic of a two-consonant sequence between vowel groups. Certain exceptions to these general rules will occur. The module provides a means of overriding the default syllabification for a specific word in `cantus_text_syllabification.py`. Exceptions can be added to the `EXCEPTIONS_DICT` dictionary. 
diff --git a/tests/word_syllabification_tests.csv b/tests/word_syllabification_tests.csv index 23d550b..d6e7973 100644 --- a/tests/word_syllabification_tests.csv +++ b/tests/word_syllabification_tests.csv @@ -32,7 +32,7 @@ conscientiam,con-sci-en-ti-am, monstrat,mon-strat, brachium,bra-chi-um, uulto,uul-to, -xpistus,xpis-tus, +xpistus,xpi-stus, yesse,yes-se, languorem,lan-guo-rem, coniugem,con-iu-gem, @@ -44,11 +44,11 @@ iniunxit,in-iun-xit, coniunctos,con-iunc-tos, ihericho,ihe-ri-co, extinguere,ex-tin-gue-re, -iniusticias,in-ius-ti-ci-as, +iniusticias,in-iu-sti-ci-as, unguenti,un-guen-ti, unguebat,un-gue-bat, adiuvemur,ad-iu-ve-mur, -subiecisti,sub-ie-cis-ti, +subiecisti,sub-ie-ci-sti, adiutorium,ad-iu-to-ri-um, mercenarijs,mer-ce-na-ri-js, iherosolimam,ihe-ro-so-li-mam, @@ -65,8 +65,8 @@ constitui,con-sti-tu-i, apposuit,ap-po-su-it, iubente,iu-ben-te, tuis,tu-is, -maiestatis,ma-ies-ta-tis, -iusticie,ius-ti-ci-e, +maiestatis,ma-ie-sta-tis, +iusticie,iu-sti-ci-e, iudeos,iu-de-os, iudex,iu-dex, polacuit,po-la-cu-it, @@ -78,7 +78,7 @@ dei,de-i, iudicantem,iu-di-can-tem, voluit,vo-lu-it, fidei,fi-de-i, -fuisti,fu-is-ti, +fuisti,fu-i-sti, iam,iam, iacula,ia-cu-la, iubilemus,iu-bi-le-mus, @@ -87,4 +87,7 @@ in,in, maior,ma-ior, amplius,am-pli-us, adincresco,ad-in-cre-sco, -compressans,com-pres-sans \ No newline at end of file +compressans,com-pres-sans, +principem,prin-ci-pem, +redemptor,re-demp-tor, +imperator,im-pe-ra-tor \ No newline at end of file diff --git a/volpiano_display_utilities/latin_word_syllabification.py b/volpiano_display_utilities/latin_word_syllabification.py index 1a488ad..d27204a 100644 --- a/volpiano_display_utilities/latin_word_syllabification.py +++ b/volpiano_display_utilities/latin_word_syllabification.py @@ -36,8 +36,6 @@ "gn", "qu", "gu", - "nc", - "mp", "sc", "pl", "pr", @@ -51,8 +49,12 @@ "fr", "gl", "gr", + "st", } + +_NASALIZED_CONSONANTS: set = {"m", "n"} + # Prefix groups are groups of characters that serve as common prefixes. 
For details, # see README. _PREFIX_GROUPS: set = {"ab", "ob", "ad", "per", "sub", "in", "con"} @@ -203,11 +205,23 @@ def _get_syl_bound_position(ltrs_btw_vow_grps: str) -> Tuple[int, str]: in which case we split as [vowel] + [i + vowel] 2. 1 consonant between vowel groups: keep the syllable boundary where it is (consonant is part of second syllable) - 3. 2 consonants between vowel groups: split the first consonant to the + 3. 2 consonants between vowel groups: split the first consonant to the first syllable, unless the two consonants form a consonant group, in - which case keep the group on the second syllable - 4. 3+ consonants between vowel groups: add the first consonant or - consonant group to the first syllable + which case keep the group on the second syllable. + 4. 3+ consonants between vowel groups: group the final two consonants of + a 3-consonant sequence between vowel groups, if possible, and place preceding + consonants in the preceding syllable. If these cannot be grouped or there + are more than three consonants between vowel groups, group the + first two consonants, if possible, and add following consonants to the + following syllable. If neither the final two nor first two consonants can + be grouped, split the syllable after the first consonant. + + EXCEPTION: If the first consonant of a sequence of 2 or more consonants between + vowels is a nasalized consonant ("m" or "n"), we don't treat it as a consonant + for the purposes of the cases above. In practice, this means we only need to check + for the existence of a nasalized consonant at the start of a sequence of 3 or more + consonants between vowels (in the two consonant case, an initial "m" or "n" in the + sequence is already added to the preceding syllable). Two additional special cases exist. "X" is treated as a double consonant "ks" and the letter terminates the previous syllable. 
In cases where "i" @@ -223,36 +237,45 @@ def _get_syl_bound_position(ltrs_btw_vow_grps: str) -> Tuple[int, str]: and split_case is a string describing the case used to determine the syllable boundary (passed to logger). """ - num_ltrs_btw_vow_grps = len(ltrs_btw_vow_grps) + num_ltrs_btw_vow_grps: int = len(ltrs_btw_vow_grps) # Default case: syllable boundary immediately follows previous # vowel group. - syl_bound = 0 if num_ltrs_btw_vow_grps == 0: - split_case = "Hiatus" - elif ltrs_btw_vow_grps[0] == "x": - syl_bound = 1 - split_case = "X is double consonant" - elif num_ltrs_btw_vow_grps == 1: - split_case = "1 consonant between vowels" - elif num_ltrs_btw_vow_grps == 2: + return 0, "Hiatus" + if ltrs_btw_vow_grps[0] == "x": + return 1, "X is double consonant" + if num_ltrs_btw_vow_grps == 1: + return 0, "1 consonant between vowels" + # If the first letter of the consonant sequence is a nasalized consonant, + # we add it to the prior syllable and treat the remaining consonants + # as if they were the only consonants between the vowel groups. + num_consonants: int = num_ltrs_btw_vow_grps + if ltrs_btw_vow_grps[0] in _NASALIZED_CONSONANTS: + syl_bound: int = 1 + ltrs_btw_vow_grps = ltrs_btw_vow_grps[1:] + num_ltrs_btw_vow_grps -= 1 + # If there is only one consonant remaining, we treat it as the only + # consonant between the vowel groups and add it to the following syllable. 
+ if num_ltrs_btw_vow_grps == 1: + return syl_bound, "2 consonants between vowels" + else: + syl_bound = 0 + if num_ltrs_btw_vow_grps == 2: if ltrs_btw_vow_grps not in _CONSONANT_GROUPS: - syl_bound = 1 - split_case = "2 consonants between vowels" + syl_bound += 1 + split_case: str = f"{num_consonants} consonants between vowels" else: - split_case = "2 consonants between vowels (consonant group)" + split_case = f"{num_consonants} consonants (consonant group) between vowels" + elif ltrs_btw_vow_grps == "str": + split_case = f"{num_consonants} consonants ('str' group) between vowels" + elif ltrs_btw_vow_grps[1:] in _CONSONANT_GROUPS: + syl_bound += 1 + split_case = f"{num_consonants} consonants (consonant group) between vowels" + elif ltrs_btw_vow_grps[:2] in _CONSONANT_GROUPS: + split_case = f"{num_consonants} consonants (consonant group) between vowels" else: - # in situations where 3 or more consonants are between consecutive vowels, - # group the final two consonants, if possible (amplius -> am-pl-ius). If not, - # group the first two consonants (coniunctos -> con-iunc-tos), if possible. If - # neither the final two nor first two consonants can be grouped, split after - # the first consonant. - if ltrs_btw_vow_grps[1:] in _CONSONANT_GROUPS: - syl_bound = num_ltrs_btw_vow_grps - 2 - elif ltrs_btw_vow_grps[:2] in _CONSONANT_GROUPS: - syl_bound = 2 - else: - syl_bound = 1 - split_case = "3+ consonants between vowels" + syl_bound += 1 + split_case = f"{num_consonants} consonants between vowels" return syl_bound, split_case