-
Notifications
You must be signed in to change notification settings - Fork 2
/
string-formatting.red
499 lines (441 loc) · 14.1 KB
/
string-formatting.red
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
Red [
File: %string-formatting.red
Purpose: "Red string formatting functions"
Date: "13-Apr-2017"
Version: 0.0.1
Author: "Gregg Irwin"
Notes: {
}
TBD: {
}
]
string-formatting: context [
e.g.: :comment
; Generic support funcs (belong in more general mezzanine libs)
; This is a temp version of a split-at func, hence the different name.
break-at: function [
"Split the series at a position or value, returning the two halves, excluding delim."
series [series!]
delim "Delimiting value, or index if an integer"
/last "Split at the last occurrence of value, from the tail"
/local s
][
reduce either all [integer? delim not last] [
parse series [collect [keep delim skip keep to end]]
][
if string? series [delim: form delim]
if not find/only series delim [
return reduce [copy series copy ""]
]
either last [
reduce [
copy/part series find/only/last series :delim
copy find/only/last/tail series :delim
]
][
; `copy s` is here because `keep to` doesn't collect anything if the
; delim is the first thing in the string.
parse series [collect [keep copy s to delim delim keep to end]]
]
]
]
;>> break-at "" "."
;== ["" ""]
;>> break-at "132" "."
;== ["132" ""]
;>> break-at "132." "."
;== ["132" #"^@"]
change-all: func [
"Change each value in the series by applying a function to it"
series [series!]
fn [any-function!] "Function that takes one arg"
][
forall series [change series fn first series]
series
]
; I've never liked the name of this func, but I'm including it here
; because the behavior is handy for how I'm merging masks currently.
first+: func [
"Return first value in series, and increment the series index."
'word [word! paren!] "Word must be a series."
][
if paren? :word [set/any 'word do :word]
also pick get word 1 set word next get word
]
form-if-char: func [val][either char? val [form val][:val]]
; I have this here because some old format code I'm porting uses it.
; It may all change to `rejoin`, but it gave me a reason to port `join`
; to Red for real and think about object/map support. `Rejoin` doesn't
; work for those. The question, then, is what value there is in a
; uniform interface for copy+extend.
join: func [
"Concatenate values"
a "Coerced to string if not a series, map, or object"
b "Single value or block of values; reduced if a is not an object or map"
][
if all [block? :b not object? :a not map? :a] [b: reduce b]
case [
series? :a [append copy a :b]
map? :a [extend copy a :b]
object? :a [make a :b]
'else [append form :a :b]
]
]
;---------------------------------------------------------------------------
set 'align function [
{Justify the given string to the specified width and direction}
s [any-string!] "The string to justify"
wd [integer!] "The target width, in characters"
/left "Left align the string (default)"
/center "Center align the string"
; {Center justify the string. If the total length of the padding
; is an odd number of characters, the extra character will be on
; the right.}
/right "Right align the string"
/with "Fill with something other than space"
; {Allows you to specify filler other than space. If you specify a
; string more than 1 character in length, it will be repeated as
; many times as necessary.}
filler [any-string! char!] "The character, or string, to use as filler"
][
if 0 >= pad-len: (wd - length? s) [return s] ; Never truncate
filler: form any [filler space]
result: head insert/dup make string! wd filler (wd / length? filler)
; If they gave us a multi-char filler, and it isn't evenly multiplied
; into the desired width, we have to add some extra chars at the end
; to make up for the difference.
if wd > length? result [
append result copy/part filler (wd - length? result)
]
pos: either center [
add 1 to integer! divide pad-len 2
][
either right [add 1 pad-len] [1]
]
head change/part at result pos s length? s
]
e.g. [
align "a" 10
align/center "a" 10
align/right "a" 10
align/with "a" 10 #"*"
align/center/with "a" 10 #"*"
align/right/with "a" 10 #"*"
align/with "a" 10 "._"
align/center/with "a" 10 "._"
align/right/with "a" 10 "._"
align/with "a" 10 "+________+"
align/center/with "a" 10 "+________+"
align/right/with "a" 10 "+________+"
template: "+________+"
align/with "abcd" length? template template
align/center/with "abcd" length? template template
align/right/with "abcd" length? template template
]
fill: function [
"Fill part of a template string with a formed value"
str [any-string!] "Template string"
align [word!] "[left center right]"
val "(formed) Value to insert in template string"
;/trunc "Truncate val if longer than str" ;?? make ellipsis last char if truncated?
][
str: copy str ; Don't modify template string
;if not any-string? val [val: form val] ; Prep the value
val: form val ; Prep val; always copy as we may return it
diff: (length? str) - (length? val) ; How much longer is the template than the value
if not positive? diff [return val] ; Never truncate the formed value
pos: switch/default align [
left [1]
center [add 1 to integer! divide diff 2]
right [add 1 diff]
][1]
head change at str pos val
]
e.g. [
template: "+________+"
fill template 'left ""
fill template 'right ""
fill template 'center ""
fill template 'left "abc"
fill template 'right "abc"
fill template 'center "abc"
fill template 'left "abcd"
fill template 'right "abcd"
fill template 'center "abcd"
fill template 'left "abcdefghi"
fill template 'right "abcdefghi"
fill template 'center "abcdefghi"
fill template 'left "abcdefghij"
fill template 'right "abcdefghij"
fill template 'center "abcdefghij"
fill template 'left "abcdefghijk"
fill template 'right "abcdefghijk"
fill template 'center "abcdefghijk"
]
pad-aligned: func [
"Wrapper for `pad` to ease refinement propagation"
str [string!] align [word!] wd [integer!] ch [char!]
][
switch align [
left [pad/with str wd ch]
right [pad/with/left str wd ch]
]
]
;---------------------------------------------------------------------------
format-string: func [
value [any-string!]
fmt [word! string! block!] "Named or custom format"
/local fmts
][
either word? fmt [
; Named formats. Can't use AA/aa/Aa because switch isn't case sensitive.
; Need to use something else to do that here.
switch fmt [
;general [value]
upper uppercase all-caps [uppercase value]
lower lowercase [lowercase value]
cap capitalize [uppercase/part lowercase value 1]
;proper [uppercase/part lowercase value 1]
;camel
; throw error - unknown named format specified?
;case else [either any-block? value [reform value] [form value]]
]
][
; custom format
either block? fmt [
context [
align=: wd=: fill=: rules:
=align: =wd: =fill: mod: res:
none
align=: [opt 'align set =align ['left | 'center | 'right]] ; opt 'align set =align
; 'size or 'pad keywods for width?
wd=: [opt ['width | 'wd] set =wd integer! (if negative? =wd [=wd: abs =wd align: 'right])]
fill=: [opt ['filler | 'fill opt 'with] set =fill [char! | string!]]
rules: [
(
=align: 'left
=fill: #" "
=wd: 0
)
; Case change rules have to come first, before alignment rules.
; Will that confuse people?
opt [
['upper | 'uppercase] (uppercase value)
| ['lower | 'lowercase] (lowercase value)
| ['cap | 'capitalize] (uppercase/part lowercase value 1)
]
any [align= | wd= | fill=] (
res: do reduce [
to path! compose [justify (=align) with] value =wd =fill
]
)
]
either parse fmt rules [res] [#ERR]
]
][
; TBD - string format?
; @&<>! ; specials
; aa - lower
; AA - upper
; Aa - capitalize
; AaAa - camel
; left center right < ^ >
; ... ; show ellipsis if truncated
; width (left justify)
; negative width (right justify)
; What do we do for an as-is format? i.e. strings in a block that
; they don't want formatted?
]
]
]
] ; end of string-formatting context
;-------------------------------------------------------------------------------
;
; Capitalization (still very much experimental and incomplete)
;
; http://en.wikipedia.org/wiki/Capitalization
; http://individed.com/code/to-title-case/
; http://individed.com/code/to-title-case/tests.html
; http://daringfireball.net/2008/08/title_case_update
; http://www.heikniemi.net/hardcoded/2004/10/propercase-for-c/
; http://blogs.msdn.com/b/michkap/archive/2005/03/04/384927.aspx
; Title Case - the first letter of each word is capitalized, the rest are lower case.
; In some cases short articles, prepositions, and conjunctions are not capitalized.
;
; Proper Case - Used for proper nouns, the first letter of each word is capitalized.
;
; CamelCase - First letter of each word capitalized, spaces and punctuation removed.
; "Q&A" "R&D" "AT&T"
; http://www.sti.nasa.gov/sp7084/ch4.html
capitalization-ctx: context [
ch-whitespace=: charset " ^/^-"
ch-digit=: charset "1234567890"
;ch-hexdigit=: charset "1234567890abcdefABCDEF"
ch-lower=: charset [#"a" - #"z"]
ch-upper=: charset [#"A" - #"Z"]
ch-alpha=: union ch-lower= ch-upper=
ch-alphanum=: union ch-alpha= ch-digit=
;ch-ascii=: charset [#"^(00)" - #"^(7F)"]
;ch-low-ascii=: charset [#"^(00)" - #"^(1F)"]
;ch-high-ascii=: charset [#"^(80)" - #"^(FF)"]
ch-word=: ch-alphanum=
;ch-word=: union ch-alphanum= charset "_" ;?
ch-non-word=: complement ch-word=
;auxilliaries: ["is" "am" "are" "was" "be" "has" "had" "do" "did"]
articles: ["the" "a" "an"]
prepositions: ["of" "to" "in" "for" "with" "on"]
conjunctions: ["for" "and" "nor" "but" "or" "yet" "so"] ; "either" "not" "neither" "both" "whether"
foreign-particles: [
"von" ; German - don't cap
"van" "de" "der" "ter" ; Dutch - "Van der" only cap first, cap only if no given name
]
;medial-prefixes: ["Mc" "Mc'" "Mac" "Mac'" "O'" "M'"]
medial-prefixes: ["Mc"] ; "Mac"?
word=: [
copy =word some ch-word=
;copy =word to ch-non-word=
]
def-dict: compose/deep [
lower [(union union articles prepositions conjunctions)]
upper [
"NW" "SW" "NE" "SE" "SSW" "SSE" "NNW" "NNE"
;"Q&A" "R&D" "AT&T" ; & is a break char right now, so we need to consider how best to do this.
"UK" "USA"
]
fixed ["MHz"]
]
; U.S.A. is a tricky one, because the dot is seen as a break and the 'a is
; seen as an article, which is then converted to lowercase. So we end up
; parsing it as three separate, single-letter words with breaks.
; uppercase/part 1
;
; last-break: :this-break
; this-break: Look for word break
; If last-word (last-break to this-break) is
; acronym [no change]
; small word [lowercase it]
; starts with o' and is > 3 chars [uppercase/part 3]
; ? starts with mc [uppercase/part 1 at str 3 uppercase/part 1]
; ? starts with mac [uppercase/part 1 at str 4 uppercase/part 1]
; [van von der ] [lowercase it]
; ? should
set 'capitalize func [
string [any-string!]
/name "Don't use dict for special processing"
/address "Don't use dict for special processing"
/camel "Non-word chars are removed"
;/break-at non-word-chars [bitset!]
/with dict [block!] {[lower ["a" "an" "the"] upper ["AT&T"] fixed ["MacLeod"]}
/show "Show words and their offsets as they are found"
] [
if empty? string [return string]
dict: any [dict def-dict]
; cap: func [
; string [any-string!]
; ] [
; uppercase/part lowercase string 1
; ]
;
; cap-it?: func [
; string [any-string!]
; ] [
; if any [name address] [return true]
; ]
;
; uppercase?: func [
; "Returns true if the string is all caps."
; string [any-string!]
; ][
; parse string [some [ch-upper=]]
; ]
;
; lowercase?: func [
; "Returns true if the string is all lowercase chars."
; string [any-string!]
; ][
; parse string [some [ch-lower=]]
; ]
w-start: none
w-end: none
lower-wd: does [
change/part w-start lowercase/part w-start w-end w-end
]
upper-wd: does [
change/part w-start uppercase/part w-start w-end w-end
]
cap-wd: does [
change/part w-start uppercase/part w-start 1 w-end
]
chg-wd: func [new-wd [string!]] [
change/part w-start new-wd w-end
]
parse string [
some [
w-start: word= w-end: (
if show [print [=word index? w-start index? w-end]]
case [
find dict/fixed =word [
chg-wd pick dict/fixed index? find dict/fixed =word
]
any [
find foreign-particles =word
find dict/lower =word
] [lower-wd]
; Need to determine what to do about & breaking words, which these may be.
find dict/upper =word [upper-wd]
; Medial prefixes - make this dynamic for different prefixes
"Mc" = copy/part =word 2 [
lower-wd
cap-wd
change/part next next w-start w-end uppercase/part next next w-start 1
]
; End of a contraction of possesive
all [
find ["T" "S"] =word
#"'" = attempt [first back w-start]
] [lower-wd]
; Standard word to capitalize
'else [lower-wd cap-wd]
]
) :w-end
| skip
]
]
; This does NOT strip leading digits from the word.
if camel [
parse string [
some [
mark: ch-non-word= (mark: remove mark) :mark
| skip
]
]
]
; Always capitalize the first letter.
; Should fixed words override this?
uppercase/part string 1
]
]
comment {
capitalize-tests: [
"ALONZO-MEDRANO"
"URDANETA-ROSARIO"
"MIGUEL" "BENAVIDESAQUILLILA" "5559 Gatlin Av # G" "Orlando"
"JOSE" "FELICIANODELGADO" "114 Sandy Point Way" "Clermont"
"YOANDRIS" "ALVAREZ GUTIERREZ" "70 E 55TH ST"
"11120 SW 196TH ST 402 B"
"mccray"
"mckinzie"
"Mcleod"
"Macleod"
"o'Brien"
"O'reilly"
"von helsing"
"van der meer"
"500mhz"
"500 mhz"
"u.s.a"
]
foreach str capitalize-tests [
print [mold str tab mold capitalize copy str]
]
}