Skip to content

Commit

Permalink
ISO-8859-1 now transcodes to UTF-8 (#112)
Browse files Browse the repository at this point in the history
  • Loading branch information
dharple committed Mar 31, 2024
1 parent e8141aa commit e9434d4
Show file tree
Hide file tree
Showing 6 changed files with 365 additions and 221 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### Changed

- The translation table for CP-1252 now transcodes characters to UTF-8. [#112]
- The translation tables for CP-1252 and ISO-8859-1 now transcode characters to
UTF-8. [#112]

## [2.0.0] - 2024-03-30
### Added
Expand Down
142 changes: 142 additions & 0 deletions bin/make-iso8859-1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
#
# This script generates the ISO-8859-1 translation table. Each character in
# the range 0x00A0-0x00FF is written to stdout as a row that maps it to the
# "\uXXXX" escape for the same code point, followed by a description looked up
# in the existing Unicode tables.
#

# The project root is the parent of the directory that contains this script.
PROJECT_ROOT=$(dirname "$(dirname "$(realpath "$0")")")
TABLEPATH="$PROJECT_ROOT/table"

# Tables searched for each character's description. unicode.tbl is used when
# it holds exactly one match for the character; otherwise unidecode.tbl is.
TABLE1=$TABLEPATH/unicode.tbl
TABLE2=$TABLEPATH/unidecode.tbl

# Key given to the first entry of CHARS; it is incremented once per entry.
START_HEX=0x00A0

#
# ISO-8859-1 Translation - 0x00A0-0x00FF
#
# Every character in this range sits at the identical Unicode code point, so
# the list is simply the 96 four-digit hex values 00A0 through 00FF, one per
# line. The loop runs inside the command substitution, so its counter does
# not leak into the rest of the script.
#

CHARS=$(
    for (( CODEPOINT = 0x00A0; CODEPOINT <= 0x00FF; CODEPOINT++ )) ; do
        printf '%04X\n' "$CODEPOINT"
    done
)

#
# Walk CHARS in order and print one table row per entry:
#
#   0xNNNN "\uXXXX" # description
#
# NNNN is the running key, starting at START_HEX, and XXXX is the entry
# itself. An entry of "undef" produces a "-" placeholder row instead.
#

CURRENT=$(printf "%d" "$START_HEX")

for CHAR in $CHARS ; do
    CURRENT_HEX=$(printf "0x%04X" "$CURRENT")

    if [ "$CHAR" = "undef" ] ; then
        echo "$CURRENT_HEX \"-\" # undefined, or control character"
    else
        # Take the description from unicode.tbl when it has exactly one row
        # for this code point; otherwise fall back to unidecode.tbl. The count
        # is anchored to the start of the line, like the lookup below, so a
        # mention of the code point elsewhere on a line cannot skew it.
        if [ "$(grep -c "^0x$CHAR" "$TABLE1")" -eq 1 ] ; then
            SOURCE_TABLE=$TABLE1
        else
            SOURCE_TABLE=$TABLE2
        fi

        DESCRIPTION=$(grep "^0x$CHAR" "$SOURCE_TABLE" | sed -e "s/^.*# //")

        # printf always terminates the row, even when no description was
        # found, so a missing entry cannot fuse this row with the next one.
        printf '%s "\\u%s" # %s\n' "$CURRENT_HEX" "$CHAR" "$DESCRIPTION"
    fi

    CURRENT=$((CURRENT + 1))
done
192 changes: 100 additions & 92 deletions src/builtin_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,101 +97,109 @@ table_t *load_builtin_safe_table(void)
* Generated from iso8859_1.tbl
*/

static table_row_t builtin_iso8859_1_rows[88] = {
{ .key = 0x00a0, .data = " " },
{ .key = 0x00a1, .data = "!" },
{ .key = 0x00a2, .data = "_cent_" },
{ .key = 0x00a3, .data = "_pound_" },
{ .key = 0x00a4, .data = "$" },
{ .key = 0x00a5, .data = "_yen_" },
{ .key = 0x00a7, .data = "_ss_" },
{ .key = 0x00a8, .data = " " },
{ .key = 0x00a9, .data = "_copy_" },
{ .key = 0x00aa, .data = "_a_" },
{ .key = 0x00ab, .data = "\"" },
{ .key = 0x00ad, .data = "-" },
{ .key = 0x00ae, .data = "_reg_" },
{ .key = 0x00b0, .data = "_deg_" },
{ .key = 0x00b2, .data = "^2" },
{ .key = 0x00b3, .data = "^3" },
{ .key = 0x00b4, .data = "'" },
{ .key = 0x00b5, .data = "u" },
{ .key = 0x00b6, .data = "_pp_" },
{ .key = 0x00b7, .data = "*" },
{ .key = 0x00b8, .data = " " },
{ .key = 0x00b9, .data = "^1" },
{ .key = 0x00ba, .data = "_o_" },
{ .key = 0x00bb, .data = "\"" },
{ .key = 0x00bf, .data = "?" },
{ .key = 0x00c0, .data = "A" },
{ .key = 0x00c1, .data = "A" },
{ .key = 0x00c2, .data = "A" },
{ .key = 0x00c3, .data = "A" },
{ .key = 0x00c4, .data = "A" },
{ .key = 0x00c5, .data = "A" },
{ .key = 0x00c6, .data = "AE" },
{ .key = 0x00c7, .data = "C" },
{ .key = 0x00c8, .data = "E" },
{ .key = 0x00c9, .data = "E" },
{ .key = 0x00ca, .data = "E" },
{ .key = 0x00cb, .data = "E" },
{ .key = 0x00cc, .data = "I" },
{ .key = 0x00cd, .data = "I" },
{ .key = 0x00ce, .data = "I" },
{ .key = 0x00cf, .data = "I" },
{ .key = 0x00d0, .data = "TH" },
{ .key = 0x00d1, .data = "N" },
{ .key = 0x00d2, .data = "O" },
{ .key = 0x00d3, .data = "O" },
{ .key = 0x00d4, .data = "O" },
{ .key = 0x00d5, .data = "O" },
{ .key = 0x00d6, .data = "O" },
{ .key = 0x00d7, .data = "x" },
{ .key = 0x00d8, .data = "O" },
{ .key = 0x00d9, .data = "U" },
{ .key = 0x00da, .data = "U" },
{ .key = 0x00db, .data = "U" },
{ .key = 0x00dc, .data = "U" },
{ .key = 0x00dd, .data = "Y" },
{ .key = 0x00de, .data = "TH" },
{ .key = 0x00df, .data = "ss" },
{ .key = 0x00e0, .data = "a" },
{ .key = 0x00e1, .data = "a" },
{ .key = 0x00e2, .data = "a" },
{ .key = 0x00e3, .data = "a" },
{ .key = 0x00e4, .data = "a" },
{ .key = 0x00e5, .data = "a" },
{ .key = 0x00e6, .data = "ae" },
{ .key = 0x00e7, .data = "c" },
{ .key = 0x00e8, .data = "e" },
{ .key = 0x00e9, .data = "e" },
{ .key = 0x00ea, .data = "e" },
{ .key = 0x00eb, .data = "e" },
{ .key = 0x00ec, .data = "i" },
{ .key = 0x00ed, .data = "i" },
{ .key = 0x00ee, .data = "i" },
{ .key = 0x00ef, .data = "i" },
{ .key = 0x00f0, .data = "th" },
{ .key = 0x00f1, .data = "n" },
{ .key = 0x00f2, .data = "o" },
{ .key = 0x00f3, .data = "o" },
{ .key = 0x00f4, .data = "o" },
{ .key = 0x00f5, .data = "o" },
{ .key = 0x00f6, .data = "o" },
{ .key = 0x00f8, .data = "o" },
{ .key = 0x00f9, .data = "u" },
{ .key = 0x00fa, .data = "u" },
{ .key = 0x00fb, .data = "u" },
{ .key = 0x00fc, .data = "u" },
{ .key = 0x00fd, .data = "y" },
{ .key = 0x00fe, .data = "th" },
{ .key = 0x00ff, .data = "y" },
/*
 * One row per ISO-8859-1 character from 0x00a0 through 0x00ff (96 rows).
 * Each row's data is a string literal holding the universal character name
 * for the same code point as its key.
 *
 * NOTE(review): the bytes the compiler stores for these literals depend on
 * its execution character set -- presumably UTF-8 here; confirm against the
 * build flags.
 */
static table_row_t builtin_iso8859_1_rows[96] = {
{ .key = 0x00a0, .data = "\u00A0" },
{ .key = 0x00a1, .data = "\u00A1" },
{ .key = 0x00a2, .data = "\u00A2" },
{ .key = 0x00a3, .data = "\u00A3" },
{ .key = 0x00a4, .data = "\u00A4" },
{ .key = 0x00a5, .data = "\u00A5" },
{ .key = 0x00a6, .data = "\u00A6" },
{ .key = 0x00a7, .data = "\u00A7" },
{ .key = 0x00a8, .data = "\u00A8" },
{ .key = 0x00a9, .data = "\u00A9" },
{ .key = 0x00aa, .data = "\u00AA" },
{ .key = 0x00ab, .data = "\u00AB" },
{ .key = 0x00ac, .data = "\u00AC" },
{ .key = 0x00ad, .data = "\u00AD" },
{ .key = 0x00ae, .data = "\u00AE" },
{ .key = 0x00af, .data = "\u00AF" },
{ .key = 0x00b0, .data = "\u00B0" },
{ .key = 0x00b1, .data = "\u00B1" },
{ .key = 0x00b2, .data = "\u00B2" },
{ .key = 0x00b3, .data = "\u00B3" },
{ .key = 0x00b4, .data = "\u00B4" },
{ .key = 0x00b5, .data = "\u00B5" },
{ .key = 0x00b6, .data = "\u00B6" },
{ .key = 0x00b7, .data = "\u00B7" },
{ .key = 0x00b8, .data = "\u00B8" },
{ .key = 0x00b9, .data = "\u00B9" },
{ .key = 0x00ba, .data = "\u00BA" },
{ .key = 0x00bb, .data = "\u00BB" },
{ .key = 0x00bc, .data = "\u00BC" },
{ .key = 0x00bd, .data = "\u00BD" },
{ .key = 0x00be, .data = "\u00BE" },
{ .key = 0x00bf, .data = "\u00BF" },
{ .key = 0x00c0, .data = "\u00C0" },
{ .key = 0x00c1, .data = "\u00C1" },
{ .key = 0x00c2, .data = "\u00C2" },
{ .key = 0x00c3, .data = "\u00C3" },
{ .key = 0x00c4, .data = "\u00C4" },
{ .key = 0x00c5, .data = "\u00C5" },
{ .key = 0x00c6, .data = "\u00C6" },
{ .key = 0x00c7, .data = "\u00C7" },
{ .key = 0x00c8, .data = "\u00C8" },
{ .key = 0x00c9, .data = "\u00C9" },
{ .key = 0x00ca, .data = "\u00CA" },
{ .key = 0x00cb, .data = "\u00CB" },
{ .key = 0x00cc, .data = "\u00CC" },
{ .key = 0x00cd, .data = "\u00CD" },
{ .key = 0x00ce, .data = "\u00CE" },
{ .key = 0x00cf, .data = "\u00CF" },
{ .key = 0x00d0, .data = "\u00D0" },
{ .key = 0x00d1, .data = "\u00D1" },
{ .key = 0x00d2, .data = "\u00D2" },
{ .key = 0x00d3, .data = "\u00D3" },
{ .key = 0x00d4, .data = "\u00D4" },
{ .key = 0x00d5, .data = "\u00D5" },
{ .key = 0x00d6, .data = "\u00D6" },
{ .key = 0x00d7, .data = "\u00D7" },
{ .key = 0x00d8, .data = "\u00D8" },
{ .key = 0x00d9, .data = "\u00D9" },
{ .key = 0x00da, .data = "\u00DA" },
{ .key = 0x00db, .data = "\u00DB" },
{ .key = 0x00dc, .data = "\u00DC" },
{ .key = 0x00dd, .data = "\u00DD" },
{ .key = 0x00de, .data = "\u00DE" },
{ .key = 0x00df, .data = "\u00DF" },
{ .key = 0x00e0, .data = "\u00E0" },
{ .key = 0x00e1, .data = "\u00E1" },
{ .key = 0x00e2, .data = "\u00E2" },
{ .key = 0x00e3, .data = "\u00E3" },
{ .key = 0x00e4, .data = "\u00E4" },
{ .key = 0x00e5, .data = "\u00E5" },
{ .key = 0x00e6, .data = "\u00E6" },
{ .key = 0x00e7, .data = "\u00E7" },
{ .key = 0x00e8, .data = "\u00E8" },
{ .key = 0x00e9, .data = "\u00E9" },
{ .key = 0x00ea, .data = "\u00EA" },
{ .key = 0x00eb, .data = "\u00EB" },
{ .key = 0x00ec, .data = "\u00EC" },
{ .key = 0x00ed, .data = "\u00ED" },
{ .key = 0x00ee, .data = "\u00EE" },
{ .key = 0x00ef, .data = "\u00EF" },
{ .key = 0x00f0, .data = "\u00F0" },
{ .key = 0x00f1, .data = "\u00F1" },
{ .key = 0x00f2, .data = "\u00F2" },
{ .key = 0x00f3, .data = "\u00F3" },
{ .key = 0x00f4, .data = "\u00F4" },
{ .key = 0x00f5, .data = "\u00F5" },
{ .key = 0x00f6, .data = "\u00F6" },
{ .key = 0x00f7, .data = "\u00F7" },
{ .key = 0x00f8, .data = "\u00F8" },
{ .key = 0x00f9, .data = "\u00F9" },
{ .key = 0x00fa, .data = "\u00FA" },
{ .key = 0x00fb, .data = "\u00FB" },
{ .key = 0x00fc, .data = "\u00FC" },
{ .key = 0x00fd, .data = "\u00FD" },
{ .key = 0x00fe, .data = "\u00FE" },
{ .key = 0x00ff, .data = "\u00FF" },
};

static table_t builtin_iso8859_1_table = {
.length = 88,
.used = 88,
.max_data_length = 7,
.length = 96,
.used = 96,
.max_data_length = 6,
.max_key = 0x00ff,
.hits = 0,
.misses = 0,
Expand Down
Loading

0 comments on commit e9434d4

Please sign in to comment.