Skip to content

Commit 471fade

Browse files
authored
Merge pull request #175 from ghaith/wstring
WSTRING support
2 parents 745c53f + 556f1a9 commit 471fade

14 files changed

+311
-25
lines changed

book/src/datatypes.md

+33-4
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,42 @@
11
# Datatypes
22

3+
## Strings
4+
### STRING
5+
rusty treats `STRING`s as byte-arrays storing UTF-8 character bytes with a Null-terminator (0-byte) at the end.
6+
So a String of size n requires n+1 bytes to account for the Null-terminator.
7+
A `STRING` literal is surrounded by single-ticks `'`.
8+
9+
A String has a well defined length which can be defined similar to the array-syntax. A String-variable
10+
`myVariable: STRING[20]` declares a byte array of length 21, to store 20 utf8 character bytes. When
11+
declaring a `STRING`, the length-attribute is optional. The default length is 80.
12+
13+
Examples
14+
- `s1 : STRING;` - declares a String of length 80
15+
- `s2 : STRING[20];` - declares a String of length 20
16+
- `s3 : STRING := 'Hello World';` - declares a String of length 80 and initializes it with the utf8 characters and a null-terminator at the end
17+
- `s4 : STRING[55] := 'Foo Baz';` - declares a String of length 55 and initializes it with the utf8 characters and a null-terminator at the end.
18+
19+
### WSTRING (Wide Strings)
20+
rusty treats `WSTRING`s as byte-arrays storing UTF-16 character bytes with two Null-terminator bytes at the end. The bytes are stored in Little Endian encoding. A Wide-String of size n requires 2 * (n+1) bytes to account for the 2 bytes per utf16 character and the Null-terminators. A `WSTRING` literal is surrounded by double-quotes `"`.
21+
22+
A `WSTRING` has a well defined length which can be defined similar to the array-syntax. A `WSTRING`-variable
23+
`myVariable: WSTRING[20]` declares a byte array of length 42, to store 20 utf16 characters (2 bytes each) plus the 2-byte Null-terminator. When
24+
declaring a `WSTRING`, the length-attribute is optional. The default length is 80.
25+
26+
Examples
27+
- `ws1 : WSTRING;` - declares a Wide-String of length 80
28+
- `ws2 : WSTRING[20];` - declares a Wide-String of length 20
29+
- `ws3 : WSTRING := "Hello World";` - declares a Wide-String of length 80 and initializes it with the utf16 characters and a utf16-null-terminator at the end
30+
- `ws4 : WSTRING[55] := "Foo Baz";` - declares a Wide-String of length 55 and initializes it with the utf16 characters and a utf16-null-terminator at the end.
31+
332
## Date and Time
433
### DATE
534
The `DATE` datatype is used to represent a Date in the Gregorian Calendar. Such a value is
635
stored as an i64 with a precision in milliseconds and denotes the number of milliseconds
736
that have elapsed since January 1, 1970 UTC not counting leap seconds. DATE literals start
837
with `DATE#` or `D#` followed by a date in the format of `yyyy-mm-dd`.
938

10-
Example literals
39+
Examples
1140
- `d1 : DATE := DATE#2021-05-02;`
1241
- `d2 : DATE := DATE#1-12-24;`
1342
- `d3 : DATE := D#2000-1-1;`
@@ -21,7 +50,7 @@ format of `yyyy-mm-dd-hh:mm:ss`.
2150

2251
Note that only the seconds-segment can have a fraction denoting the milliseconds.
2352

24-
Example literals
53+
Examples
2554
- `d1 : DATE_AND_TIME := DATE_AND_TIME#2021-05-02-14:20:10.25;`
2655
- `d2 : DATE_AND_TIME := DATE_AND_TIME#1-12-24-00:00:1;`
2756
- `d3 : DATE_AND_TIME := DT#1999-12-31-23:59:59.999;`
@@ -36,7 +65,7 @@ format of `hh:mm:ss`.
3665

3766
Note that only the seconds-segment can have a fraction denoting the milliseconds.
3867

39-
Example literals
68+
Examples
4069
- `t1 : TIME_OF_DAY := TIME_OF_DAY#14:20:10.25;`
4170
- `t2 : TIME_OF_DAY := TIME_OF_DAY#0:00:1;`
4271
- `t3 : TIME_OF_DAY := TOD#23:59:59.999;`
@@ -55,7 +84,7 @@ TIME literals start with `TIME#` or `T#` followed by the `TIME` segements. Suppo
5584

5685
Note that only the last segment of a `TIME` literal can have a fraction.
5786

58-
Example literals
87+
Examples
5988
- `t1 : TIME := TIME#2d4h6m8s10ms;`
6089
- `t2 : TIME := T#2d4.2h;`
6190
- `t3 : TIME := T#-10s4ms16ns;`

src/ast.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,7 @@ pub enum Statement {
461461
},
462462
LiteralString {
463463
value: String,
464+
is_wide: bool,
464465
location: SourceRange,
465466
},
466467
LiteralArray {
@@ -623,9 +624,10 @@ impl Debug for Statement {
623624
Statement::LiteralBool { value, .. } => {
624625
f.debug_struct("LiteralBool").field("value", value).finish()
625626
}
626-
Statement::LiteralString { value, .. } => f
627+
Statement::LiteralString { value, is_wide, .. } => f
627628
.debug_struct("LiteralString")
628629
.field("value", value)
630+
.field("is_wide", is_wide)
629631
.finish(),
630632
Statement::LiteralArray { elements, .. } => f
631633
.debug_struct("LiteralArray")

src/codegen/generators/data_type_generator.rs

+5-4
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,11 @@ fn create_type<'ink>(
128128
DataTypeInformation::Float { size, .. } => {
129129
get_llvm_float_type(llvm.context, *size, name).map(|it| it.into())
130130
}
131-
DataTypeInformation::String { size } => {
132-
let gen_type = llvm.context.i8_type().array_type(*size).into();
133-
Ok(gen_type)
134-
}
131+
DataTypeInformation::String { size, encoding } => Ok(llvm
132+
.context
133+
.i8_type()
134+
.array_type(*size * encoding.get_bytes_per_char())
135+
.into()),
135136
DataTypeInformation::SubRange {
136137
referenced_type, ..
137138
} => {

src/codegen/generators/expression_generator.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -1159,7 +1159,13 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
11591159
self.llvm
11601160
.create_const_real(self.index, &self.get_type_context(), value)
11611161
}
1162-
Statement::LiteralString { value, .. } => self.llvm.create_const_string(value.as_str()),
1162+
Statement::LiteralString { value, is_wide, .. } => {
1163+
if *is_wide {
1164+
self.llvm.create_const_utf16_string(value.as_str())
1165+
} else {
1166+
self.llvm.create_const_utf8_string(value.as_str())
1167+
}
1168+
}
11631169
Statement::LiteralArray { elements, location } => {
11641170
self.generate_literal_array(elements, location)
11651171
}

src/codegen/generators/llvm.rs

+34-3
Original file line numberDiff line numberDiff line change
@@ -221,14 +221,45 @@ impl<'a> Llvm<'a> {
221221
}
222222
}
223223

224-
/// create a constant string-value with the given value
224+
/// create a constant utf8 string-value with the given value
225225
///
226226
/// - `value` the value of the constant string value
227-
pub fn create_const_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
227+
pub fn create_const_utf8_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
228228
self.create_llvm_const_vec_string(value.as_bytes())
229229
}
230230

231-
/// create a constant string-value with the given value
231+
/// create a constant utf16 string-value with the given value
232+
///
233+
/// - `value` the value of the constant string value
234+
pub fn create_const_utf16_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
235+
let mut utf16_chars: Vec<u16> = value.encode_utf16().collect();
236+
//it only contains a single NUL-terminator-byte so we add a second one
237+
utf16_chars.push(0);
238+
self.create_llvm_const_utf16_vec_string(utf16_chars.as_slice())
239+
}
240+
241+
/// create a constant utf16 string-value with the given value
242+
///
243+
/// - `value` the value of the constant string value
244+
pub fn create_llvm_const_utf16_vec_string(
245+
&self,
246+
value: &[u16],
247+
) -> Result<TypeAndValue<'a>, CompileError> {
248+
let mut bytes = Vec::with_capacity(value.len() * 2);
249+
value.iter().for_each(|it| {
250+
let ordered_bytes = it.to_le_bytes(); //todo make this a compiler-setting
251+
bytes.push(ordered_bytes[0]);
252+
bytes.push(ordered_bytes[1]);
253+
});
254+
255+
let exp_value = self.context.const_string(bytes.as_slice(), false);
256+
Ok((
257+
typesystem::new_wide_string_information(value.len() as u32),
258+
BasicValueEnum::VectorValue(exp_value),
259+
))
260+
}
261+
262+
/// create a constant utf8 string-value with the given value
232263
///
233264
/// - `value` the value of the constant string value
234265
pub fn create_llvm_const_vec_string(

src/codegen/tests/code_gen_tests.rs

+20-5
Original file line numberDiff line numberDiff line change
@@ -498,23 +498,27 @@ fn program_with_string_assignment() {
498498
r#"PROGRAM prg
499499
VAR
500500
y : STRING;
501+
z : WSTRING;
501502
END_VAR
502503
y := 'im a genius';
504+
z := "im a utf16 genius";
503505
END_PROGRAM
504506
"#
505507
);
506508

507509
let expected = r#"; ModuleID = 'main'
508510
source_filename = "main"
509511
510-
%prg_interface = type { [81 x i8] }
512+
%prg_interface = type { [81 x i8], [162 x i8] }
511513
512514
@prg_instance = global %prg_interface zeroinitializer
513515
514516
define void @prg(%prg_interface* %0) {
515517
entry:
516518
%y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
519+
%z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
517520
store [12 x i8] c"im a genius\00", [81 x i8]* %y, align 1
521+
store [36 x i8] c"i\00m\00 \00a\00 \00u\00t\00f\001\006\00 \00g\00e\00n\00i\00u\00s\00\00\00", [162 x i8]* %z, align 1
518522
ret void
519523
}
520524
"#;
@@ -527,31 +531,36 @@ fn program_with_string_type_assignment() {
527531
let result = codegen!(
528532
r#"
529533
TYPE MyString: STRING[99] := 'abc'; END_TYPE
534+
TYPE MyWString: WSTRING[99] := "abc"; END_TYPE
530535
531536
PROGRAM prg
532537
VAR
533538
y : STRING;
534539
z : MyString;
540+
zz : MyWString;
535541
END_VAR
536542
y := 'im a genius';
537543
z := 'im also a genius';
544+
zz := "im also a genius";
538545
END_PROGRAM
539546
"#
540547
);
541548

542549
let expected = r#"; ModuleID = 'main'
543550
source_filename = "main"
544551
545-
%prg_interface = type { [81 x i8], [100 x i8] }
552+
%prg_interface = type { [81 x i8], [100 x i8], [200 x i8] }
546553
547-
@prg_instance = global %prg_interface { [81 x i8] zeroinitializer, [4 x i8] c"abc\00" }
554+
@prg_instance = global %prg_interface { [81 x i8] zeroinitializer, [4 x i8] c"abc\00", [8 x i8] c"a\00b\00c\00\00\00" }
548555
549556
define void @prg(%prg_interface* %0) {
550557
entry:
551558
%y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
552559
%z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
560+
%zz = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 2
553561
store [12 x i8] c"im a genius\00", [81 x i8]* %y, align 1
554562
store [17 x i8] c"im also a genius\00", [100 x i8]* %z, align 1
563+
store [34 x i8] c"i\00m\00 \00a\00l\00s\00o\00 \00a\00 \00g\00e\00n\00i\00u\00s\00\00\00", [200 x i8]* %zz, align 1
555564
ret void
556565
}
557566
"#;
@@ -566,24 +575,30 @@ fn variable_length_strings_can_be_created() {
566575
VAR
567576
y : STRING[15];
568577
z : STRING[3] := 'xyz';
578+
wy : WSTRING[15];
579+
wz : WSTRING[3] := "xyz";
569580
END_VAR
570581
y := 'im a genius';
582+
wy := "im a genius";
571583
END_PROGRAM
572584
"#
573585
);
574586

575587
let expected = r#"; ModuleID = 'main'
576588
source_filename = "main"
577589
578-
%prg_interface = type { [16 x i8], [4 x i8] }
590+
%prg_interface = type { [16 x i8], [4 x i8], [32 x i8], [8 x i8] }
579591
580-
@prg_instance = global %prg_interface { [16 x i8] zeroinitializer, [4 x i8] c"xyz\00" }
592+
@prg_instance = global %prg_interface { [16 x i8] zeroinitializer, [4 x i8] c"xyz\00", [32 x i8] zeroinitializer, [8 x i8] c"x\00y\00z\00\00\00" }
581593
582594
define void @prg(%prg_interface* %0) {
583595
entry:
584596
%y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
585597
%z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
598+
%wy = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 2
599+
%wz = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 3
586600
store [12 x i8] c"im a genius\00", [16 x i8]* %y, align 1
601+
store [24 x i8] c"i\00m\00 \00a\00 \00g\00e\00n\00i\00u\00s\00\00\00", [32 x i8]* %wy, align 1
587602
ret void
588603
}
589604
"#;

src/index/visitor.rs

+14-2
Original file line numberDiff line numberDiff line change
@@ -280,13 +280,25 @@ fn visit_data_type(index: &mut Index, type_declatation: &UserTypeDeclaration) {
280280
information,
281281
)
282282
}
283-
DataType::StringType { name, size, .. } => {
283+
DataType::StringType {
284+
name,
285+
size,
286+
is_wide,
287+
..
288+
} => {
284289
let size = if let Some(statement) = size {
285290
evaluate_constant_int(&statement).unwrap() as u32
286291
} else {
287292
crate::typesystem::DEFAULT_STRING_LEN // DEFAULT STRING LEN
288293
} + 1;
289-
let information = DataTypeInformation::String { size };
294+
295+
let encoding = if *is_wide {
296+
StringEncoding::Utf16
297+
} else {
298+
StringEncoding::Utf8
299+
};
300+
301+
let information = DataTypeInformation::String { size, encoding };
290302
index.register_type(
291303
name.as_ref().unwrap(),
292304
type_declatation.initializer.clone(),

src/lexer.rs

+6
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,9 @@ pub enum Token {
272272
#[token("STRING")]
273273
KeywordString,
274274

275+
#[token("WSTRING")]
276+
KeywordWideString,
277+
275278
#[token("OF")]
276279
KeywordOf,
277280

@@ -356,6 +359,9 @@ pub enum Token {
356359
#[regex("'((\\$.)|[^$'])*'")]
357360
LiteralString,
358361

362+
#[regex("\"((\\$.)|[^$\"])*\"")]
363+
LiteralWideString,
364+
359365
#[regex(r"[ \t\n\f]+", logos::skip)]
360366
End,
361367
}

src/lexer/tests/lexer_tests.rs

+23
Original file line numberDiff line numberDiff line change
@@ -505,3 +505,26 @@ fn string_parsing() {
505505
assert_eq!("'AB$''", lexer.slice());
506506
lexer.advance();
507507
}
508+
509+
#[test]
510+
fn wide_string_parsing() {
511+
let mut lexer = lex(r#"
512+
WSTRING
513+
"AB C"
514+
"AB$$"
515+
"AB$""
516+
"#);
517+
518+
assert_eq!(lexer.token, KeywordWideString);
519+
assert_eq!("WSTRING", lexer.slice());
520+
lexer.advance();
521+
assert_eq!(lexer.token, LiteralWideString);
522+
assert_eq!(r#""AB C""#, lexer.slice());
523+
lexer.advance();
524+
assert_eq!(lexer.token, LiteralWideString);
525+
assert_eq!(r#""AB$$""#, lexer.slice());
526+
lexer.advance();
527+
assert_eq!(lexer.token, LiteralWideString);
528+
assert_eq!(r#""AB$"""#, lexer.slice());
529+
lexer.advance();
530+
}

src/parser.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,8 @@ fn parse_data_type_definition(
356356
},
357357
None,
358358
))
359-
} else if lexer.token == KeywordString {
359+
} else if lexer.token == KeywordString || lexer.token == KeywordWideString {
360+
let is_wide = lexer.token == KeywordWideString;
360361
lexer.advance();
361362
let size = if allow(KeywordSquareParensOpen, lexer) {
362363
let size_statement = parse_expression(lexer)?;
@@ -379,7 +380,7 @@ fn parse_data_type_definition(
379380
DataTypeDeclaration::DataTypeDefinition {
380381
data_type: DataType::StringType {
381382
name,
382-
is_wide: false,
383+
is_wide,
383384
size,
384385
},
385386
},

src/parser/expressions_parser.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,8 @@ fn parse_leaf_expression(lexer: &mut RustyLexer) -> Result<Statement, String> {
217217
LiteralTimeOfDay => parse_literal_time_of_day(lexer),
218218
LiteralTime => parse_literal_time(lexer),
219219
LiteralDateAndTime => parse_literal_date_and_time(lexer),
220-
LiteralString => parse_literal_string(lexer),
220+
LiteralString => parse_literal_string(lexer, false),
221+
LiteralWideString => parse_literal_string(lexer, true),
221222
LiteralTrue => parse_bool_literal(lexer, true),
222223
LiteralFalse => parse_bool_literal(lexer, false),
223224
KeywordSquareParensOpen => parse_array_literal(lexer),
@@ -521,11 +522,12 @@ fn trim_quotes(quoted_string: &str) -> String {
521522
quoted_string[1..quoted_string.len() - 1].to_string()
522523
}
523524

524-
fn parse_literal_string(lexer: &mut RustyLexer) -> Result<Statement, String> {
525+
fn parse_literal_string(lexer: &mut RustyLexer, is_wide: bool) -> Result<Statement, String> {
525526
let result = lexer.slice();
526527
let location = lexer.location();
527528
let string_literal = Ok(Statement::LiteralString {
528529
value: trim_quotes(result),
530+
is_wide,
529531
location,
530532
});
531533
lexer.advance();

0 commit comments

Comments
 (0)