Skip to content

Commit 163a0c6

Browse files
authored
annotate and generate STRING-literals precisely according to their length (#421)
The annotator tells us exactly how long every string-literal is by creating dedicated STRING-types for them. The expression generator now applies two different strategies: when generating string-literals in declarations (outside of a body), we generate the full length of the string-literal, so a STRING[80] = 'a' will generate 80 additional \00 to fill the full vector; when generating string-literals in the body, we generate only the string's real length, so an assignment x := 'a'; will generate a vec of length 2 with 'a' and \00 in it (so we don't fill x). Fixes #417
1 parent 61dd6d3 commit 163a0c6

22 files changed

+249
-98
lines changed

src/codegen/generators/expression_generator.rs

+34-7
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ pub struct ExpressionCodeGenerator<'a, 'b> {
4444
pub temp_variable_prefix: String,
4545
/// the string-suffix to use for temporary variables
4646
pub temp_variable_suffix: String,
47+
48+
// the function on how to obtain the the length to use for the string
49+
string_len_provider: fn(type_length_declaration: usize, actual_length: usize) -> usize,
4750
}
4851

4952
/// context information to generate a parameter
@@ -77,6 +80,7 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
7780
function_context: Some(function_context),
7881
temp_variable_prefix: "load_".to_string(),
7982
temp_variable_suffix: "".to_string(),
83+
string_len_provider: |_, actual_length| actual_length, //when generating string-literals in a body, use the actual length
8084
}
8185
}
8286

@@ -101,6 +105,7 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
101105
function_context: None,
102106
temp_variable_prefix: "load_".to_string(),
103107
temp_variable_suffix: "".to_string(),
108+
string_len_provider: |type_length_declaration, _| type_length_declaration, //when generating string-literals in declarations, use the declared length
104109
}
105110
}
106111

@@ -1409,12 +1414,35 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
14091414
} => {
14101415
let expected_type = self.get_type_hint_info_for(literal_statement)?;
14111416
match expected_type {
1412-
DataTypeInformation::String { encoding, .. } => match encoding {
1413-
StringEncoding::Utf8 => self.llvm.create_const_utf8_string(value.as_str()),
1414-
StringEncoding::Utf16 => {
1415-
self.llvm.create_const_utf16_string(value.as_str())
1417+
DataTypeInformation::String { encoding, size, .. } => {
1418+
let declared_length = size.as_int_value(self.index).map_err(|msg| {
1419+
Diagnostic::codegen_error(
1420+
format!("Unable to generate string-literal: {}", msg).as_str(),
1421+
literal_statement.get_location(),
1422+
)
1423+
})? as usize;
1424+
1425+
match encoding {
1426+
StringEncoding::Utf8 => {
1427+
//note that .len() will give us the number of bytes, not the number of characters
1428+
let actual_length = value.chars().count() + 1; // +1 to account for a final \0
1429+
let str_len = std::cmp::min(
1430+
(self.string_len_provider)(declared_length, actual_length),
1431+
declared_length,
1432+
);
1433+
self.llvm.create_const_utf8_string(value.as_str(), str_len)
1434+
}
1435+
StringEncoding::Utf16 => {
1436+
//note that .len() will give us the number of bytes, not the number of characters
1437+
let actual_length = value.encode_utf16().count() + 1; // +1 to account for a final \0
1438+
let str_len = std::cmp::min(
1439+
(self.string_len_provider)(declared_length, actual_length),
1440+
declared_length,
1441+
);
1442+
self.llvm.create_const_utf16_string(value.as_str(), str_len)
1443+
}
14161444
}
1417-
},
1445+
}
14181446
DataTypeInformation::Integer { size: 8, .. }
14191447
if expected_type.is_character() =>
14201448
{
@@ -1630,10 +1658,9 @@ impl<'a, 'b> ExpressionCodeGenerator<'a, 'b> {
16301658
.ok_or_else(|| {
16311659
Diagnostic::codegen_error("Cannot generate empty array", location.clone())
16321660
}) //TODO
1633-
.and_then(|it| self.get_type_hint_info_for(it))?;
1661+
.and_then(|it| self.get_type_hint_for(it))?;
16341662

16351663
let llvm_type = self.llvm_index.get_associated_type(inner_type.get_name())?;
1636-
16371664
let mut v = Vec::new();
16381665
for e in elements {
16391666
//generate with correct type hint

src/codegen/generators/llvm.rs

+27-7
Original file line numberDiff line numberDiff line change
@@ -200,17 +200,33 @@ impl<'a> Llvm<'a> {
200200
/// create a constant utf8 string-value with the given value
201201
///
202202
/// - `value` the value of the constant string value
203-
pub fn create_const_utf8_string(&self, value: &str) -> Result<BasicValueEnum<'a>, Diagnostic> {
204-
self.create_llvm_const_vec_string(value.as_bytes())
203+
pub fn create_const_utf8_string(
204+
&self,
205+
value: &str,
206+
len: usize,
207+
) -> Result<BasicValueEnum<'a>, Diagnostic> {
208+
let mut utf8_chars = value.as_bytes()[..std::cmp::min(value.len(), len - 1)].to_vec();
209+
//fill the 0 terminators
210+
while utf8_chars.len() < len {
211+
utf8_chars.push(0);
212+
}
213+
self.create_llvm_const_vec_string(utf8_chars.as_slice())
205214
}
206215

207216
/// create a constant utf16 string-value with the given value
208217
///
209218
/// - `value` the value of the constant string value
210-
pub fn create_const_utf16_string(&self, value: &str) -> Result<BasicValueEnum<'a>, Diagnostic> {
219+
/// - `len` the len of the string, the literal will be right-padded with 0-bytes to match the length
220+
pub fn create_const_utf16_string(
221+
&self,
222+
value: &str,
223+
len: usize,
224+
) -> Result<BasicValueEnum<'a>, Diagnostic> {
211225
let mut utf16_chars: Vec<u16> = value.encode_utf16().collect();
212-
//it only contains a single NUL-terminator-byte so we add a second one
213-
utf16_chars.push(0);
226+
//fill the 0 terminators
227+
while utf16_chars.len() < len {
228+
utf16_chars.push(0);
229+
}
214230
self.create_llvm_const_utf16_vec_string(utf16_chars.as_slice())
215231
}
216232

@@ -235,8 +251,12 @@ impl<'a> Llvm<'a> {
235251
&self,
236252
value: &[u8],
237253
) -> Result<BasicValueEnum<'a>, Diagnostic> {
238-
let exp_value = self.context.const_string(value, true);
239-
Ok(BasicValueEnum::VectorValue(exp_value))
254+
let values: Vec<IntValue> = value
255+
.iter()
256+
.map(|it| self.context.i8_type().const_int(*it as u64, false))
257+
.collect();
258+
let vector = self.context.i8_type().const_array(&values);
259+
Ok(BasicValueEnum::ArrayValue(vector))
240260
}
241261

242262
/// create a constant i8 character (IntValue) with the given value

src/codegen/llvm_typesystem.rs

+3-54
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,10 @@ pub fn cast_if_needed<'ctx>(
267267
statement.get_location(),
268268
)),
269269
},
270-
DataTypeInformation::String { size, encoding } => match value_type {
270+
DataTypeInformation::String { encoding, .. } => match value_type {
271271
DataTypeInformation::String {
272-
size: value_size,
273272
encoding: value_encoding,
273+
..
274274
} => {
275275
if encoding != value_encoding {
276276
return Err(Diagnostic::casting_error(
@@ -279,58 +279,7 @@ pub fn cast_if_needed<'ctx>(
279279
statement.get_location(),
280280
));
281281
}
282-
let size = size.as_int_value(index).map_err(|msg| {
283-
Diagnostic::codegen_error(msg.as_str(), SourceRange::undefined())
284-
})? as u32;
285-
let value_size = value_size.as_int_value(index).map_err(|msg| {
286-
Diagnostic::codegen_error(msg.as_str(), SourceRange::undefined())
287-
})? as u32;
288-
289-
if size < value_size {
290-
//we need to downcast the size of the string
291-
//check if it's a literal, if so we can exactly know how big this is
292-
if let AstStatement::LiteralString {
293-
is_wide,
294-
value: string_value,
295-
..
296-
} = statement
297-
{
298-
let value = if *is_wide {
299-
let mut chars = string_value.encode_utf16().collect::<Vec<u16>>();
300-
//We add a null terminator since the llvm command will not account for
301-
//it
302-
chars.push(0);
303-
let total_bytes_to_copy = std::cmp::min(size, chars.len() as u32);
304-
let new_value = &chars[0..(total_bytes_to_copy) as usize];
305-
llvm.create_llvm_const_utf16_vec_string(new_value)?
306-
} else {
307-
let bytes = string_value.bytes().collect::<Vec<u8>>();
308-
let total_bytes_to_copy = std::cmp::min(size - 1, bytes.len() as u32);
309-
let new_value = &bytes[0..total_bytes_to_copy as usize];
310-
//This accounts for a null terminator, hence we don't add it here.
311-
llvm.create_llvm_const_vec_string(new_value)?
312-
};
313-
Ok(value)
314-
} else {
315-
//if we are on a vector replace it
316-
if value.is_vector_value() {
317-
let vec_value = value.into_vector_value();
318-
let string_value = vec_value.get_string_constant().to_bytes();
319-
let real_size = std::cmp::min(size, (string_value.len() + 1) as u32);
320-
if real_size < value_size {
321-
let new_value = &string_value[0..(real_size - 1) as usize];
322-
let value = llvm.create_llvm_const_vec_string(new_value)?;
323-
Ok(value)
324-
} else {
325-
Ok(value)
326-
}
327-
} else {
328-
Ok(value) //Don't break, just don't cast
329-
}
330-
}
331-
} else {
332-
Ok(value)
333-
}
282+
Ok(value)
334283
}
335284
_ => Err(Diagnostic::casting_error(
336285
value_type.get_name(),

src/codegen/tests/initialization_test/global_initializers.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ fn initial_values_in_global_constant_variables() {
1010
1111
c_BOOL : BOOL := TRUE;
1212
c_not : BOOL := NOT c_BOOL;
13-
c_str : STRING := 'Hello';
14-
c_wstr : WSTRING := "World";
13+
c_str : STRING[10] := 'Hello';
14+
c_wstr : WSTRING[10] := "World";
1515
1616
c_real : REAL := 3.14;
1717
c_lreal : LREAL := 3.1415;
@@ -26,8 +26,8 @@ fn initial_values_in_global_constant_variables() {
2626
nb : BOOL := c_not;
2727
bb : BOOL := c_not AND NOT c_not;
2828
29-
str : STRING := c_str;
30-
wstr : WSTRING := c_wstr;
29+
str : STRING[10] := c_str;
30+
wstr : WSTRING[10] := c_wstr;
3131
3232
r : REAL := c_real / 2;
3333
tau : LREAL := 2 * c_lreal;

src/codegen/tests/initialization_test/snapshots/rusty__codegen__tests__initialization_test__global_initializers__initial_values_in_global_constant_variables.snap

+4-4
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ source_filename = "main"
1111
@c_3c = unnamed_addr constant i16 21
1212
@c_BOOL = unnamed_addr constant i1 true
1313
@c_not = unnamed_addr constant i1 false
14-
@c_str = unnamed_addr constant [81 x i8] c"Hello\00"
15-
@c_wstr = unnamed_addr constant [81 x i16] [i16 87, i16 111, i16 114, i16 108, i16 100, i16 0]
14+
@c_str = unnamed_addr constant [11 x i8] c"Hello\00\00\00\00\00\00"
15+
@c_wstr = unnamed_addr constant [11 x i16] [i16 87, i16 111, i16 114, i16 108, i16 100, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0]
1616
@c_real = unnamed_addr constant float 0x40091EB860000000
1717
@c_lreal = unnamed_addr constant double 3.141500e+00
1818
@x = unnamed_addr constant i16 7
@@ -21,8 +21,8 @@ source_filename = "main"
2121
@b = unnamed_addr constant i1 true
2222
@nb = unnamed_addr constant i1 false
2323
@bb = unnamed_addr constant i1 false
24-
@str = unnamed_addr constant [81 x i8] c"Hello\00"
25-
@wstr = unnamed_addr constant [81 x i16] [i16 87, i16 111, i16 114, i16 108, i16 100, i16 0]
24+
@str = unnamed_addr constant [11 x i8] c"Hello\00\00\00\00\00\00"
25+
@wstr = unnamed_addr constant [11 x i16] [i16 87, i16 111, i16 114, i16 108, i16 100, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0]
2626
@r = unnamed_addr constant float 0x3FF91EB860000000
2727
@tau = unnamed_addr constant double 6.283000e+00
2828

Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
source: src/codegen/tests/initialization_test/type_initializers.rs
3+
assertion_line: 93
34
expression: result
45

56
---
@@ -8,5 +9,5 @@ source_filename = "main"
89

910
@arr = global [5 x i16] [i16 1, i16 2, i16 3]
1011
@b_exp = global [6 x i32] [i32 4, i32 6, i32 6, i32 10]
11-
@str = global [4 x [81 x i8]] [[81 x i8] c"first\00", [81 x i8] c"second\00"]
12+
@str = global [4 x [81 x i8]] [[81 x i8] c"first\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [81 x i8] c"second\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00"]
1213

src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__expression_list_as_array_initilization.snap

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ source_filename = "main"
88

99
@arr = global [4 x i16] [i16 1, i16 2, i16 3]
1010
@b_exp = global [5 x i32] [i32 4, i32 6, i32 6, i32 10]
11-
@str = global [3 x [81 x i8]] [[81 x i8] c"first\00", [81 x i8] c"second\00"]
11+
@str = global [3 x [81 x i8]] [[81 x i8] c"first\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [81 x i8] c"second\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00"]
1212

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: src/codegen/tests/code_gen_tests.rs
3+
expression: result
4+
5+
---
6+
; ModuleID = 'main'
7+
source_filename = "main"
8+
9+
@a = global [7 x [11 x i8]] [[11 x i8] c"Monday\00\00\00\00\00", [11 x i8] c"Tuesday\00\00\00\00", [11 x i8] c"Wednesday\00\00", [11 x i8] c"Thursday\00\00\00", [11 x i8] c"Friday\00\00\00\00\00", [11 x i8] c"Saturday\00\00\00", [11 x i8] c"Sunday\00\00\00\00\00"]
10+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
source: src/codegen/tests/code_gen_tests.rs
3+
expression: result
4+
5+
---
6+
; ModuleID = 'main'
7+
source_filename = "main"
8+
9+
@b = global [11 x i8] c"Monday\00\00\00\00\00"
10+

src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_special_chars_in_string.snap

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
source: src/codegen/tests/code_gen_tests.rs
3+
assertion_line: 485
34
expression: result
45

56
---
@@ -16,7 +17,7 @@ entry:
1617
%should_not_replace_s = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 1
1718
%should_replace_ws = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 2
1819
%should_not_replace_ws = getelementptr inbounds %prg_interface, %prg_interface* %0, i32 0, i32 3
19-
store [41 x i8] c"a\0A\0A b\0A\0A c\0C\0C d\0D\0D e\09\09 $ 'single' W\F0\9F\92\96\F0\9F\92\96\00", [81 x i8]* %should_replace_s, align 1
20+
store [35 x i8] c"a\0A\0A b\0A\0A c\0C\0C d\0D\0D e\09\09 $ 'single' W\F0\9F\00", [81 x i8]* %should_replace_s, align 1
2021
store [19 x i8] c"\0043 $\22no replace$\22\00", [81 x i8]* %should_not_replace_s, align 1
2122
store [37 x i16] [i16 97, i16 10, i16 10, i16 32, i16 98, i16 10, i16 10, i16 32, i16 99, i16 12, i16 12, i16 32, i16 100, i16 13, i16 13, i16 32, i16 101, i16 9, i16 9, i16 32, i16 36, i16 32, i16 34, i16 100, i16 111, i16 117, i16 98, i16 108, i16 101, i16 34, i16 32, i16 87, i16 -10179, i16 -9066, i16 -10179, i16 -9066, i16 0], [81 x i16]* %should_replace_ws, align 2
2223
store [19 x i16] [i16 36, i16 52, i16 51, i16 32, i16 36, i16 39, i16 110, i16 111, i16 32, i16 114, i16 101, i16 112, i16 108, i16 97, i16 99, i16 101, i16 36, i16 39, i16 0], [81 x i16]* %should_not_replace_ws, align 2

src/codegen/tests/snapshots/rusty__codegen__tests__code_gen_tests__program_with_string_assignment.snap

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
source: src/codegen/tests/code_gen_tests.rs
3+
assertion_line: 462
34
expression: result
45

56
---

src/codegen/tests/snapshots/rusty__codegen__tests__string_tests__function_parameters_string.snap

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
22
source: src/codegen/tests/string_tests.rs
3+
assertion_line: 146
34
expression: program
45

56
---

0 commit comments

Comments
 (0)