PLC-lang · ghaith · May 19, 2021 · May 19, 2021
diff --git a/book/src/datatypes.md b/book/src/datatypes.md
@@ -1,13 +1,42 @@
 # Datatypes
 
+## Strings
+### STRING
+rusty treats `STRING`s as byte-arrays storing UTF-8 character bytes with a Null-terminator (0-byte) at the end. 
+So a String of size n requires n+1 bytes to account for the Null-terminator.
+A `STRING` literal is surrounded by single-ticks `'`.
+
+A String has a well defined length which can be defined similar to the array-syntax. A String-variable 
+`myVariable: STRING[20]` declares a byte array of length 21, to store 20 utf8 character bytes. When 
+declaring a `STRING`, the length-attribute is optional. The default length is 80.
+
+Examples
+- `s1 : STRING;` - declares a String of length 80
+- `s2 : STRING[20];` - declares a String of length 20
+- `s3 : STRING := 'Hello World';` - declares a String of length 80 and initializes it with the utf8 characters and a null-terminator at the end.
+- `s4 : STRING[55] := 'Foo Baz';` - declares a String of length 55 and initializes it with the utf8 characters and a null-terminator at the end.
+
+### WSTRING (Wide Strings)
+rusty treats `WSTRING`s as byte-arrays storing UTF-16 character bytes with two Null-terminator bytes at the end. The bytes are stored in Little Endian encoding. A Wide-String of size n requires 2 * (n+1) bytes to account for the 2 bytes per utf16 character and the Null-terminators. A `WSTRING` literal is surrounded by double-quotes `"`.
+
+A `WSTRING` has a well defined length which can be defined similar to the array-syntax. A `WSTRING`-variable 
+`myVariable: WSTRING[20]` declares a byte array of length 42, to store 20 utf16 characters (2 bytes each). When 
+declaring a `WSTRING`, the length-attribute is optional. The default length is 80.
+
+Examples
+- `ws1 : WSTRING;` - declares a Wide-String of length 80
+- `ws2 : WSTRING[20];` - declares a Wide-String of length 20
+- `ws3 : WSTRING := "Hello World";` - declares a Wide-String of length 80 and initializes it with the utf16 characters and a utf16-null-terminator at the end.
+- `ws4 : WSTRING[55] := "Foo Baz";` - declares a Wide-String of length 55 and initializes it with the utf16 characters and a utf16-null-terminator at the end.
+
 ## Date and Time
 ### DATE
 The `DATE` datatype is used to represent a Date in the Gregorian Calendar. Such a value is 
 stored as an i64 with a precision in milliseconds and denotes the number of milliseconds 
 that have elapsed since January 1, 1970 UTC not counting leap seconds. DATE literals start 
 with `DATE#` or `D#` followed by a date in the format of `yyyy-mm-dd`.
 
-Example literals
+Examples
 - `d1 : DATE := DATE#2021-05-02;`
 - `d2 : DATE := DATE#1-12-24;`
 - `d3 : DATE := D#2000-1-1;`
@@ -21,7 +50,7 @@ format of `yyyy-mm-dd-hh:mm:ss`.
 
 Note that only the seconds-segment can have a fraction denoting the milliseconds.
 
-Example literals
+Examples
 - `d1 : DATE_AND_TIME := DATE_AND_TIME#2021-05-02-14:20:10.25;`
 - `d2 : DATE_AND_TIME := DATE_AND_TIME#1-12-24-00:00:1;`
 - `d3 : DATE_AND_TIME := DT#1999-12-31-23:59:59.999;`
@@ -36,7 +65,7 @@ format of `hh:mm:ss`.
 
 Note that only the seconeds-segment can have a fraction denoting the milliseconds.
 
-Example literals
+Examples
 - `t1 : TIME_OF_DAY := TIME_OF_DAY#14:20:10.25;`
 - `t2 : TIME_OF_DAY := TIME_OF_DY#0:00:1;`
 - `t3 : TIME_OF_DAY := TOD#23:59:59.999;`
@@ -55,7 +84,7 @@ TIME literals start with `TIME#` or `T#` followed by the `TIME` segements. Suppo
 
 Note that only the last segment of a `TIME` literal can have a fraction.
 
-Example literals
+Examples
 - `t1 : TIME := TIME#2d4h6m8s10ms;`
 - `t2 : TIME := T#2d4.2h;`
 - `t3 : TIME := T#-10s4ms16ns;`
diff --git a/src/ast.rs b/src/ast.rs
@@ -461,6 +461,7 @@ pub enum Statement {
     },
     LiteralString {
         value: String,
+        is_wide: bool,
         location: SourceRange,
     },
     LiteralArray {
@@ -623,9 +624,10 @@ impl Debug for Statement {
             Statement::LiteralBool { value, .. } => {
                 f.debug_struct("LiteralBool").field("value", value).finish()
             }
-            Statement::LiteralString { value, .. } => f
+            Statement::LiteralString { value, is_wide, .. } => f
                 .debug_struct("LiteralString")
                 .field("value", value)
+                .field("is_wide", is_wide)
                 .finish(),
             Statement::LiteralArray { elements, .. } => f
                 .debug_struct("LiteralArray")

diff --git a/src/codegen/generators/data_type_generator.rs b/src/codegen/generators/data_type_generator.rs
@@ -128,10 +128,11 @@ fn create_type<'ink>(
         DataTypeInformation::Float { size, .. } => {
             get_llvm_float_type(llvm.context, *size, name).map(|it| it.into())
         }
-        DataTypeInformation::String { size } => {
-            let gen_type = llvm.context.i8_type().array_type(*size).into();
-            Ok(gen_type)
-        }
+        DataTypeInformation::String { size, encoding } => Ok(llvm
+            .context
+            .i8_type()
+            .array_type(*size * encoding.get_bytes_per_char())
+            .into()),
         DataTypeInformation::SubRange {
             referenced_type, ..
         } => {

diff --git a/src/codegen/generators/expression_generator.rs b/src/codegen/generators/expression_generator.rs
@@ -1159,7 +1159,13 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
                 self.llvm
                     .create_const_real(self.index, &self.get_type_context(), value)
             }
-            Statement::LiteralString { value, .. } => self.llvm.create_const_string(value.as_str()),
+            Statement::LiteralString { value, is_wide, .. } => {
+                if *is_wide {
+                    self.llvm.create_const_utf16_string(value.as_str())
+                } else {
+                    self.llvm.create_const_utf8_string(value.as_str())
+                }
+            }
             Statement::LiteralArray { elements, location } => {
                 self.generate_literal_array(elements, location)
             }

diff --git a/src/codegen/generators/llvm.rs b/src/codegen/generators/llvm.rs
@@ -221,14 +221,45 @@ impl<'a> Llvm<'a> {
         }
     }
 
-    /// create a constant string-value with the given value
+    /// create a constant utf8 string-value with the given value
     ///
     /// - `value` the value of the constant string value
-    pub fn create_const_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
+    pub fn create_const_utf8_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
         self.create_llvm_const_vec_string(value.as_bytes())
     }
 
-    /// create a constant string-value with the given value
+    /// create a constant utf16 string-value with the given value
+    ///
+    /// - `value` the value of the constant string value
+    pub fn create_const_utf16_string(&self, value: &str) -> Result<TypeAndValue<'a>, CompileError> {
+        let mut utf16_chars: Vec<u16> = value.encode_utf16().collect();
+        //encode_utf16() yields no terminator, so we append one NUL code unit (two 0-bytes)
+        utf16_chars.push(0);
+        self.create_llvm_const_utf16_vec_string(utf16_chars.as_slice())
+    }
+
+    /// create a constant utf16 string-value with the given value
+    ///
+    /// - `value` the value of the constant string value
+    pub fn create_llvm_const_utf16_vec_string(
+        &self,
+        value: &[u16],
+    ) -> Result<TypeAndValue<'a>, CompileError> {
+        let mut bytes = Vec::with_capacity(value.len() * 2);
+        value.iter().for_each(|it| {
+            let ordered_bytes = it.to_le_bytes(); //todo make this a compiler-setting
+            bytes.push(ordered_bytes[0]);
+            bytes.push(ordered_bytes[1]);
+        });
+
+        let exp_value = self.context.const_string(bytes.as_slice(), false);
+        Ok((
+            typesystem::new_wide_string_information(value.len() as u32),
+            BasicValueEnum::VectorValue(exp_value),
+        ))
+    }
+
+    /// create a constant utf8 string-value with the given value
     ///
     /// - `value` the value of the constant string value
     pub fn create_llvm_const_vec_string(

diff --git a/src/codegen/tests/code_gen_tests.rs b/src/codegen/tests/code_gen_tests.rs
@@ -498,23 +498,27 @@ fn program_with_string_assignment() {
         r#"PROGRAM prg
 VAR
 y : STRING;
+z : WSTRING;
 END_VAR
 y := 'im a genius';
+z := "im a utf16 genius";
 END_PROGRAM
 "#
     );
 
     let expected = r#"; ModuleID = 'main'
 source_filename = "main"
 
-%prg_interface = type { [81 x i8] }
+%prg_interface = type { [81 x i8], [162 x i8] }
 
 @prg_instance = global %prg_interface zeroinitializer
 
 define void @prg(%prg_interface* %0) {
 entry:
   %y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
+  %z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
   store [12 x i8] c"im a genius\00", [81 x i8]* %y, align 1
+  store [36 x i8] c"i\00m\00 \00a\00 \00u\00t\00f\001\006\00 \00g\00e\00n\00i\00u\00s\00\00\00", [162 x i8]* %z, align 1
   ret void
 }
 "#;
@@ -527,31 +531,36 @@ fn program_with_string_type_assignment() {
     let result = codegen!(
         r#"
 TYPE MyString: STRING[99] := 'abc'; END_TYPE
+TYPE MyWString: WSTRING[99] := "abc"; END_TYPE
 
 PROGRAM prg
 VAR
 y : STRING;
 z : MyString;
+zz : MyWString;
 END_VAR
 y := 'im a genius';
 z := 'im also a genius';
+zz := "im also a genius";
 END_PROGRAM
 "#
     );
 
     let expected = r#"; ModuleID = 'main'
 source_filename = "main"
 
-%prg_interface = type { [81 x i8], [100 x i8] }
+%prg_interface = type { [81 x i8], [100 x i8], [200 x i8] }
 
-@prg_instance = global %prg_interface { [81 x i8] zeroinitializer, [4 x i8] c"abc\00" }
+@prg_instance = global %prg_interface { [81 x i8] zeroinitializer, [4 x i8] c"abc\00", [8 x i8] c"a\00b\00c\00\00\00" }
 
 define void @prg(%prg_interface* %0) {
 entry:
   %y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
   %z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
+  %zz = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 2
   store [12 x i8] c"im a genius\00", [81 x i8]* %y, align 1
   store [17 x i8] c"im also a genius\00", [100 x i8]* %z, align 1
+  store [34 x i8] c"i\00m\00 \00a\00l\00s\00o\00 \00a\00 \00g\00e\00n\00i\00u\00s\00\00\00", [200 x i8]* %zz, align 1
   ret void
 }
 "#;
@@ -566,24 +575,30 @@ fn variable_length_strings_can_be_created() {
           VAR
           y : STRING[15];
           z : STRING[3] := 'xyz';
+          wy : WSTRING[15];
+          wz : WSTRING[3] := "xyz";
           END_VAR
           y := 'im a genius';
+          wy := "im a genius";
         END_PROGRAM
         "#
     );
 
     let expected = r#"; ModuleID = 'main'
 source_filename = "main"
 
-%prg_interface = type { [16 x i8], [4 x i8] }
+%prg_interface = type { [16 x i8], [4 x i8], [32 x i8], [8 x i8] }
 
-@prg_instance = global %prg_interface { [16 x i8] zeroinitializer, [4 x i8] c"xyz\00" }
+@prg_instance = global %prg_interface { [16 x i8] zeroinitializer, [4 x i8] c"xyz\00", [32 x i8] zeroinitializer, [8 x i8] c"x\00y\00z\00\00\00" }
 
 define void @prg(%prg_interface* %0) {
 entry:
   %y = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 0
   %z = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
+  %wy = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 2
+  %wz = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 3
   store [12 x i8] c"im a genius\00", [16 x i8]* %y, align 1
+  store [24 x i8] c"i\00m\00 \00a\00 \00g\00e\00n\00i\00u\00s\00\00\00", [32 x i8]* %wy, align 1
   ret void
 }
 "#;

diff --git a/src/index/visitor.rs b/src/index/visitor.rs
@@ -280,13 +280,25 @@ fn visit_data_type(index: &mut Index, type_declatation: &UserTypeDeclaration) {
                 information,
             )
         }
-        DataType::StringType { name, size, .. } => {
+        DataType::StringType {
+            name,
+            size,
+            is_wide,
+            ..
+        } => {
             let size = if let Some(statement) = size {
                 evaluate_constant_int(&statement).unwrap() as u32
             } else {
                 crate::typesystem::DEFAULT_STRING_LEN // DEFAULT STRING LEN
             } + 1;
-            let information = DataTypeInformation::String { size };
+
+            let encoding = if *is_wide {
+                StringEncoding::Utf16
+            } else {
+                StringEncoding::Utf8
+            };
+
+            let information = DataTypeInformation::String { size, encoding };
             index.register_type(
                 name.as_ref().unwrap(),
                 type_declatation.initializer.clone(),

diff --git a/src/lexer.rs b/src/lexer.rs
@@ -272,6 +272,9 @@ pub enum Token {
     #[token("STRING")]
     KeywordString,
 
+    #[token("WSTRING")]
+    KeywordWideString,
+
     #[token("OF")]
     KeywordOf,
 
@@ -356,6 +359,9 @@ pub enum Token {
     #[regex("'((\\$.)|[^$'])*'")]
     LiteralString,
 
+    #[regex("\"((\\$.)|[^$\"])*\"")]
+    LiteralWideString,
+
     #[regex(r"[ \t\n\f]+", logos::skip)]
     End,
 }

diff --git a/src/lexer/tests/lexer_tests.rs b/src/lexer/tests/lexer_tests.rs
@@ -505,3 +505,26 @@ fn string_parsing() {
     assert_eq!("'AB$''", lexer.slice());
     lexer.advance();
 }
+
+#[test]
+fn wide_string_parsing() {
+    let mut lexer = lex(r#"
+    WSTRING 
+    "AB C" 
+    "AB$$" 
+    "AB$""
+    "#);
+
+    assert_eq!(lexer.token, KeywordWideString);
+    assert_eq!("WSTRING", lexer.slice());
+    lexer.advance();
+    assert_eq!(lexer.token, LiteralWideString);
+    assert_eq!(r#""AB C""#, lexer.slice());
+    lexer.advance();
+    assert_eq!(lexer.token, LiteralWideString);
+    assert_eq!(r#""AB$$""#, lexer.slice());
+    lexer.advance();
+    assert_eq!(lexer.token, LiteralWideString);
+    assert_eq!(r#""AB$"""#, lexer.slice());
+    lexer.advance();
+}
diff --git a/src/parser.rs b/src/parser.rs
@@ -356,7 +356,8 @@ fn parse_data_type_definition(
             },
             None,
         ))
-    } else if lexer.token == KeywordString {
+    } else if lexer.token == KeywordString || lexer.token == KeywordWideString {
+        let is_wide = lexer.token == KeywordWideString;
         lexer.advance();
         let size = if allow(KeywordSquareParensOpen, lexer) {
             let size_statement = parse_expression(lexer)?;
@@ -379,7 +380,7 @@ fn parse_data_type_definition(
             DataTypeDeclaration::DataTypeDefinition {
                 data_type: DataType::StringType {
                     name,
-                    is_wide: false,
+                    is_wide,
                     size,
                 },
             },

diff --git a/src/parser/expressions_parser.rs b/src/parser/expressions_parser.rs
@@ -217,7 +217,8 @@ fn parse_leaf_expression(lexer: &mut RustyLexer) -> Result<Statement, String> {
         LiteralTimeOfDay => parse_literal_time_of_day(lexer),
         LiteralTime => parse_literal_time(lexer),
         LiteralDateAndTime => parse_literal_date_and_time(lexer),
-        LiteralString => parse_literal_string(lexer),
+        LiteralString => parse_literal_string(lexer, false),
+        LiteralWideString => parse_literal_string(lexer, true),
         LiteralTrue => parse_bool_literal(lexer, true),
         LiteralFalse => parse_bool_literal(lexer, false),
         KeywordSquareParensOpen => parse_array_literal(lexer),
@@ -521,11 +522,12 @@ fn trim_quotes(quoted_string: &str) -> String {
     quoted_string[1..quoted_string.len() - 1].to_string()
 }
 
-fn parse_literal_string(lexer: &mut RustyLexer) -> Result<Statement, String> {
+fn parse_literal_string(lexer: &mut RustyLexer, is_wide: bool) -> Result<Statement, String> {
     let result = lexer.slice();
     let location = lexer.location();
     let string_literal = Ok(Statement::LiteralString {
         value: trim_quotes(result),
+        is_wide,
         location,
     });
     lexer.advance();