Skip to content

Commit

Permalink
use unicode character categories to classify identifier characters
Browse files Browse the repository at this point in the history
fixes #6797, fixes #5936
  • Loading branch information
JeffBezanson committed May 13, 2014
1 parent cc7723b commit e039950
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 13 deletions.
64 changes: 61 additions & 3 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,67 @@ value_t fl_skipws(value_t *args, u_int32_t nargs)
return skipped;
}

static int is_wc_cat_id_start(uint32_t wc, utf8proc_propval_t cat)
{
return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
// allow currency symbols
cat == UTF8PROC_CATEGORY_SC ||
// allow all latin-1 characters except math symbols and quotes
(wc <= 0xff && cat != UTF8PROC_CATEGORY_SM &&
cat != UTF8PROC_CATEGORY_PF && cat != UTF8PROC_CATEGORY_PI) ||
// Other_ID_Start
wc == 0x2118 || wc == 0x212E || (wc >= 0x309B && wc <= 0x309C));
}

static int jl_id_start_char(uint32_t wc)
{
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_')
return 1;
if (wc < 0xA1 || wc > 0x10ffff)
return 0;
const utf8proc_property_t *prop = utf8proc_get_property(wc);
return is_wc_cat_id_start(wc, prop->category);
}

static int jl_id_char(uint32_t wc)
{
return ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') ||
(wc >= '0' && wc <= '9') || (wc >= 0xA1) ||
wc == '!' || wc == '_');
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' ||
(wc >= '0' && wc <= '9') || wc == '!')
return 1;
if (wc < 0xA1 || wc > 0x10ffff)
return 0;
const utf8proc_property_t *prop = utf8proc_get_property(wc);
utf8proc_propval_t cat = prop->category;
if (is_wc_cat_id_start(wc, cat)) return 1;
if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
cat == UTF8PROC_CATEGORY_SK ||
// primes
(wc >= 0x2032 && wc <= 0x2034) ||
// Other_ID_Continue
wc == 0x0387 || wc == 0x19da || (wc >= 0x1369 && wc <= 0x1371))
return 1;
return 0;
}

value_t fl_julia_identifier_char(value_t *args, u_int32_t nargs)
{
argcount("identifier-char?", nargs, 1);
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
type_error("identifier-char?", "wchar", args[0]);
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
return jl_id_char(wc);
}

value_t fl_julia_identifier_start_char(value_t *args, u_int32_t nargs)
{
argcount("identifier-start-char?", nargs, 1);
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
type_error("identifier-start-char?", "wchar", args[0]);
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
return jl_id_start_char(wc);
}

// return NFC-normalized UTF8-encoded version of s
Expand Down Expand Up @@ -105,6 +161,8 @@ value_t fl_accum_julia_symbol(value_t *args, u_int32_t nargs)
static builtinspec_t julia_flisp_func_info[] = {
{ "skip-ws", fl_skipws },
{ "accum-julia-symbol", fl_accum_julia_symbol },
{ "identifier-char?", fl_julia_identifier_char },
{ "identifier-start-char?", fl_julia_identifier_start_char },
{ NULL, NULL }
};

Expand Down
12 changes: 2 additions & 10 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,6 @@
(let ((chrs (string->list "()[]{},;\"`@")))
(lambda (c) (memv c chrs))))
(define (newline? c) (eqv? c #\newline))
(define (identifier-char? c) (or (and (char>=? c #\A)
(char<=? c #\Z))
(and (char>=? c #\a)
(char<=? c #\z))
(and (char>=? c #\0)
(char<=? c #\9))
(char>=? c #\uA1)
(eqv? c #\_)))
;; characters that can be in an operator
(define (opchar? c) (and (char? c) (string.find op-chars c)))
;; characters that can follow . in an operator
Expand Down Expand Up @@ -418,7 +410,7 @@

((opchar? c) (read-operator port (read-char port)))

((identifier-char? c) (accum-julia-symbol c port))
((identifier-start-char? c) (accum-julia-symbol c port))

(else (error (string "invalid character \"" (read-char port) "\""))))))

Expand Down Expand Up @@ -1523,7 +1515,7 @@
(define (parse-interpolate s)
(let* ((p (ts:port s))
(c (peek-char p)))
(cond ((identifier-char? c)
(cond ((identifier-start-char? c)
(parse-atom s))
((eqv? c #\()
(read-char p)
Expand Down

0 comments on commit e039950

Please sign in to comment.