Rewrite unions such that all members are of the same size

In the C front-end, rewrite union components with types smaller than the union's size to anonymous structs. Each such struct contains the original union component plus padding. Assignments to union members thus always assign all bytes that make up the object representation of a union. The use of an anonymous struct ensures that member accesses can still be resolved. As this is a change in the semantics of goto programs, the goto binary version is incremented. rewrite_union is no longer necessary, but bugs (hidden by rewrite_union) in handling endianness in simplify_expr_member and convert_member surfaced and had to be fixed.
diffblue · Jan 14, 2021 · 1abfc62 · 1abfc62
1 parent 0df643e
commit 1abfc62
Show file tree

Hide file tree

Showing 24 changed files with 326 additions and 67 deletions.
diff --git a/jbmc/src/jdiff/jdiff_parse_options.cpp b/jbmc/src/jdiff/jdiff_parse_options.cpp
@@ -40,7 +40,6 @@ Author: Peter Schrammel
 #include <goto-programs/remove_unused_functions.h>
 #include <goto-programs/remove_vector.h>
 #include <goto-programs/remove_virtual_functions.h>
-#include <goto-programs/rewrite_union.h>
 #include <goto-programs/set_properties.h>
 #include <goto-programs/show_properties.h>
 #include <goto-programs/string_abstraction.h>
@@ -296,7 +295,6 @@ bool jdiff_parse_optionst::process_goto_program(
     remove_returns(goto_model);
     remove_vector(goto_model);
     remove_complex(goto_model);
-    rewrite_union(goto_model);
 
     // add generic checks
     log.status() << "Generic Property Instrumentation" << messaget::eom;

diff --git a/regression/ansi-c/arch_flags_mcpu_bad/object.intel b/regression/ansi-c/arch_flags_mcpu_bad/object.intel
diff --git a/regression/ansi-c/arch_flags_mcpu_bad/test.desc b/regression/ansi-c/arch_flags_mcpu_bad/test.desc
@@ -11,7 +11,7 @@ This tests the -mcpu=cortex=a15 flag that should activate ARM-32 mode.
 The object file 'object.intel' was compiled from 'source.c' with goto-cc
 on a 64-bit platform:
 
-  goto-cc -c source.c
+  goto-cc -c source.c -o object.intel
 
 preproc.i is already pre-processed so that it can be linked in without
 needing to invoke a pre-processor from a cross-compile toolchain on your

diff --git a/regression/ansi-c/arch_flags_mcpu_good/object.arm b/regression/ansi-c/arch_flags_mcpu_good/object.arm
diff --git a/regression/ansi-c/arch_flags_mthumb_bad/object.intel b/regression/ansi-c/arch_flags_mthumb_bad/object.intel
diff --git a/regression/ansi-c/arch_flags_mthumb_bad/test.desc b/regression/ansi-c/arch_flags_mthumb_bad/test.desc
@@ -11,7 +11,7 @@ This tests the -mthumb flag that should activate ARM-32 mode. The object
 file 'object.intel' was compiled from 'source.c' with goto-cc on a
 64-bit platform:
 
-  goto-cc -c source.c
+  goto-cc -c source.c -o object.intel
 
 preproc.i is already pre-processed so that it can be linked in without
 needing to invoke a pre-processor from a cross-compile toolchain on your

diff --git a/regression/ansi-c/arch_flags_mthumb_good/object.arm b/regression/ansi-c/arch_flags_mthumb_good/object.arm
diff --git a/regression/cbmc-library/float-nan-check/test.desc b/regression/cbmc-library/float-nan-check/test.desc
@@ -1,15 +1,15 @@
 CORE
 main.c
 --nan-check
-\[main.NaN.1\] line \d+ NaN on \+ in byte_extract_little_endian\(c, (0|0l|0ll), float\) \+ myzero: SUCCESS
+\[main.NaN.1\] line \d+ NaN on \+ in c\.f \+ myzero: SUCCESS
 \[main.NaN.2\] line \d+ NaN on / in myzero / myzero: FAILURE
 \[main.NaN.3\] line \d+ NaN on / in \(float\)n / myinf: FAILURE
 \[main.NaN.4\] line \d+ NaN on \* in myinf \* myzero: FAILURE
 \[main.NaN.5\] line \d+ NaN on \+ in -myinf \+ myinf: FAILURE
 \[main.NaN.6\] line \d+ NaN on - in myinf - myinf: FAILURE
 \[main.NaN.7\] line \d+ NaN on - in -myinf - -myinf: FAILURE
-\[main.NaN.8\] line \d+ NaN on \+ in byte_extract_little_endian\(c, (0|0l|0ll), float\) \+ byte_extract_little_endian\(c, (0|0l|0ll), float\): SUCCESS
-\[main.NaN.9\] line \d+ NaN on / in byte_extract_little_endian\(c, (0|0l|0ll), float\) / myzero: SUCCESS
+\[main.NaN.8\] line \d+ NaN on \+ in c\.f \+ c\.f: SUCCESS
+\[main.NaN.9\] line \d+ NaN on / in c\.f / myzero: SUCCESS
 \[main.NaN.10\] line \d+ NaN on \* in mynan \* mynan: SUCCESS
 \[main.NaN.11\] line \d+ NaN on \+ in mynan \+ mynan: SUCCESS
 \[main.NaN.12\] line \d+ NaN on - in mynan - mynan: SUCCESS

diff --git a/regression/cbmc/union7/no_simplify.desc b/regression/cbmc/union7/no_simplify.desc
@@ -0,0 +1,11 @@
+CORE broken-smt-backend
+main.c
+--big-endian --no-simplify
+^EXIT=0$
+^SIGNAL=0$
+^VERIFICATION SUCCESSFUL$
+--
+^warning: ignoring
+--
+Although this test can now be fully solved via constant propagation and
+simplification, it should also succeed without simplification.
diff --git a/regression/cbmc/union7/test.desc b/regression/cbmc/union7/test.desc
@@ -1,11 +1,11 @@
-CORE broken-smt-backend
+CORE
 main.c
 --big-endian
+Generated 3 VCC\(s\), 0 remaining after simplification
 ^EXIT=0$
 ^SIGNAL=0$
 ^VERIFICATION SUCCESSFUL$
 --
 ^warning: ignoring
 --
-This test reports a VERIFICATION ERROR when running with the SMT2 solver on
-Windows only.
+This test can now be fully solved via constant propagation and simplification.
diff --git a/regression/cbmc/xml-trace/test.desc b/regression/cbmc/xml-trace/test.desc
@@ -7,7 +7,7 @@ VERIFICATION FAILED
 <assignment assignment_type="state" base_name="u" display_name="test::u" hidden="false" identifier="test::u" mode="C" step_nr="\d+" thread="0">
 <location file=".*" function="test" line="\d+" working-directory=".*"/>
 <type>union myunion</type>
-<full_lhs>byte_extract_little_endian\(u, 0ll?, .*int.*\)</full_lhs>
+<full_lhs>u</full_lhs>
 <full_lhs_value binary="[01]+">\d+ll?</full_lhs_value>
 <value>\{ \.i=\d+ll? \}</value>
 <value_expression>

diff --git a/scripts/expected_doxygen_warnings.txt b/scripts/expected_doxygen_warnings.txt
@@ -79,13 +79,13 @@
 /home/runner/work/cbmc/cbmc/src/analyses/variable-sensitivity/variable_sensitivity_object_factory.cpp:188: warning: The following parameter of initialize_abstract_object(const typet type, bool top, bool bottom, const exprt &e, const abstract_environmentt &environment, const namespacet &ns, const vsd_configt &configuration) is not documented:
   parameter 'configuration'
 
-warning: Include graph for 'cbmc_parse_options.cpp' not generated, too many nodes (61), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
+warning: Include graph for 'cbmc_parse_options.cpp' not generated, too many nodes (60), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Include graph for 'goto_instrument_parse_options.cpp' not generated, too many nodes (97), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'goto_functions.h' not generated, too many nodes (66), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'goto_model.h' not generated, too many nodes (109), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'arith_tools.h' not generated, too many nodes (181), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
-warning: Included by graph for 'c_types.h' not generated, too many nodes (110), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
-warning: Included by graph for 'config.h' not generated, too many nodes (85), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
+warning: Included by graph for 'c_types.h' not generated, too many nodes (111), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
+warning: Included by graph for 'config.h' not generated, too many nodes (86), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'exception_utils.h' not generated, too many nodes (61), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'expr.h' not generated, too many nodes (88), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.
 warning: Included by graph for 'expr_util.h' not generated, too many nodes (60), threshold is 60. Consider increasing DOT_GRAPH_MAX_NODES.

diff --git a/src/ansi-c/c_typecheck_expr.cpp b/src/ansi-c/c_typecheck_expr.cpp
@@ -17,6 +17,7 @@ Author: Daniel Kroening, kroening@kroening.com
 #include <util/c_types.h>
 #include <util/config.h>
 #include <util/cprover_prefix.h>
+#include <util/expr_initializer.h>
 #include <util/expr_util.h>
 #include <util/ieee_float.h>
 #include <util/mathematical_expr.h>
@@ -1086,6 +1087,36 @@ void c_typecheck_baset::typecheck_expr_typecast(exprt &expr)
         expr.set(ID_C_lvalue, true);
         return;
       }
+      else if(c.get_anonymous() && c.type().id() == ID_struct_tag)
+      {
+        // we also look into anonymous members as we might have wrapped a
+        // non-maximal member in such a struct
+        const struct_typet &struct_type =
+          follow_tag(to_struct_tag_type(c.type()));
+        if(
+          !struct_type.components().empty() &&
+          struct_type.components().front().type() == op.type())
+        {
+          auto nondet =
+            nondet_initializer(c.type(), expr.source_location(), *this);
+          if(!nondet.has_value())
+          {
+            error().source_location = expr.source_location();
+            error() << "cannot nondet-initialize union component of type '"
+                    << to_string(c.type()) << "'" << eom;
+            throw 0;
+          }
+          CHECK_RETURN(nondet->id() == ID_struct);
+          CHECK_RETURN(!nondet->operands().empty());
+
+          nondet->operands().front() = op;
+          union_exprt union_expr(c.get_name(), *nondet, expr.type());
+          union_expr.add_source_location() = expr.source_location();
+          expr = union_expr;
+          expr.set(ID_C_lvalue, true);
+          return;
+        }
+      }
     }
 
     // not found, complain

diff --git a/src/ansi-c/c_typecheck_initializer.cpp b/src/ansi-c/c_typecheck_initializer.cpp
@@ -530,27 +530,36 @@ exprt::operandst::const_iterator c_typecheck_baset::do_designated_initializer(
         // initializer; other bytes have an unspecified value (C Standard
         // 6.2.6.1(7)). In practice, objects of static lifetime are fully zero
         // initialized.
-        const auto zero =
-          zero_initializer(component.type(), value.source_location(), *this);
-        if(!zero.has_value())
-        {
-          error().source_location = value.source_location();
-          error() << "cannot zero-initialize union component of type '"
-                  << to_string(component.type()) << "'" << eom;
-          throw 0;
-        }
-
         if(current_symbol.is_static_lifetime)
         {
-          byte_update_exprt byte_update{
-            byte_update_id(), *dest, from_integer(0, index_type()), *zero};
-          byte_update.add_source_location() = value.source_location();
-          *dest = std::move(byte_update);
-          dest = &(to_byte_update_expr(*dest).op2());
+          const auto zero =
+            zero_initializer(component.type(), value.source_location(), *this);
+          if(!zero.has_value())
+          {
+            error().source_location = value.source_location();
+            error() << "cannot zero-initialize union component of type '"
+                    << to_string(component.type()) << "'" << eom;
+            throw 0;
+          }
+
+          union_exprt union_expr(component.get_name(), *zero, type);
+          union_expr.add_source_location() = value.source_location();
+          *dest = std::move(union_expr);
+          dest = &(to_union_expr(*dest).op());
         }
         else
         {
-          union_exprt union_expr(component.get_name(), *zero, type);
+          const auto nondet = nondet_initializer(
+            component.type(), value.source_location(), *this);
+          if(!nondet.has_value())
+          {
+            error().source_location = value.source_location();
+            error() << "cannot nondet-initialize union component of type '"
+                    << to_string(component.type()) << "'" << eom;
+            throw 0;
+          }
+
+          union_exprt union_expr(component.get_name(), *nondet, type);
           union_expr.add_source_location() = value.source_location();
           *dest = std::move(union_expr);
           dest = &(to_union_expr(*dest).op());

diff --git a/src/ansi-c/c_typecheck_type.cpp b/src/ansi-c/c_typecheck_type.cpp
@@ -1072,6 +1072,134 @@ void c_typecheck_baset::typecheck_compound_body(
     else
       it++;
   }
+
+  if(type.id() == ID_union && !components.empty())
+  {
+    const auto widest_member =
+      to_union_type(type).find_widest_union_component(*this);
+    optionalt<exprt> size_expr_opt;
+
+    if(!widest_member.has_value())
+    {
+      // GCC extension: arrays of non-constant size
+      size_expr_opt = size_of_expr(type, *this);
+
+      if(!size_expr_opt.has_value())
+      {
+        error().source_location = type.source_location();
+        error() << "cannot determine size of union" << eom;
+        throw 0;
+      }
+    }
+
+    for(auto &component : components)
+    {
+      std::vector<typet> padding_types;
+
+      const auto &component_bits = pointer_offset_bits(component.type(), *this);
+      if(
+        component_bits.has_value() &&
+        (!widest_member.has_value() ||
+         *component_bits < widest_member->second) &&
+        *component_bits % 8 != 0)
+      {
+        std::size_t bit_field_padding_width =
+          8 - numeric_cast_v<std::size_t>(*component_bits % 8);
+        padding_types.push_back(c_bit_field_typet{
+          unsignedbv_typet{bit_field_padding_width}, bit_field_padding_width});
+      }
+
+      if(size_expr_opt.has_value())
+      {
+        const auto component_size_expr_opt =
+          size_of_expr(component.type(), *this);
+        CHECK_RETURN(component_size_expr_opt.has_value());
+        // size_of_expr will round up to full bytes, and also works for bit
+        // fields (which C sizeof does not support). Thus, we don't need to
+        // special-case bit_field_padding_opt being set.
+        if_exprt padding_bytes{
+          binary_relation_exprt{
+            *component_size_expr_opt, ID_lt, *size_expr_opt},
+          minus_exprt{*size_expr_opt, *component_size_expr_opt},
+          from_integer(0, size_expr_opt->type())};
+        simplify(padding_bytes, *this);
+
+        if(!padding_bytes.is_zero())
+        {
+          make_index_type(padding_bytes);
+
+          padding_types.push_back(
+            array_typet{unsigned_char_type(), padding_bytes});
+        }
+      }
+      else
+      {
+        CHECK_RETURN(component_bits.has_value());
+        // Truncating division ensures that we don't need to special-case
+        // bit_field_padding_opt being set.
+        std::size_t padding_bytes =
+          numeric_cast_v<std::size_t>(widest_member->second - *component_bits) /
+          8;
+
+        if(padding_bytes != 0)
+        {
+          padding_types.push_back(array_typet{
+            unsigned_char_type(), from_integer(padding_bytes, index_type())});
+        }
+      }
+
+      if(padding_types.empty())
+        continue;
+
+      struct_typet::componentst component_with_padding_type;
+      component_with_padding_type.push_back(component);
+      for(const auto &t : padding_types)
+      {
+        // TODO: perhaps use the same naming as padding.cpp uses
+        component_with_padding_type.push_back(struct_union_typet::componentt{
+          "$anon" + std::to_string(anon_member_counter++), t});
+        // TODO: not sure whether this should be anonymous
+        component_with_padding_type.back().set_anonymous(true);
+        component_with_padding_type.back().set_is_padding(true);
+        component_with_padding_type.back().add_source_location() =
+          component.source_location();
+      }
+
+      struct_union_typet::componentt component_with_padding{
+        "$anon" + std::to_string(anon_member_counter++),
+        struct_typet{component_with_padding_type}};
+      component_with_padding.set_anonymous(true);
+      component_with_padding.add_source_location() =
+        component.source_location();
+
+      // based on typecheck_compound_type, branch
+      // "if(type.find(ID_tag).is_nil())"
+      // produce type symbol
+      type_symbolt compound_symbol{component_with_padding.type()};
+      compound_symbol.location = component_with_padding.source_location();
+
+      std::string typestr = type2name(compound_symbol.type, *this);
+      compound_symbol.base_name = "#anon#" + typestr;
+      compound_symbol.name = "tag-#anon#" + typestr;
+
+      // We might already have the same anonymous struct, and this is simply
+      // ok as those are exactly the same types, just introduced at a
+      // different source location.
+      symbolt *new_symbol = &compound_symbol;
+      if(
+        symbol_table.symbols.find(compound_symbol.name) ==
+        symbol_table.symbols.end())
+      {
+        move_symbol(compound_symbol, new_symbol);
+      }
+
+      struct_tag_typet tag_type{new_symbol->name};
+      tag_type.add_source_location() = component_with_padding.source_location();
+      component_with_padding.type().swap(tag_type);
+
+      component.swap(component_with_padding);
+    }
+  }
 }
 
 typet c_typecheck_baset::enum_constant_type(

diff --git a/src/cbmc/cbmc_parse_options.cpp b/src/cbmc/cbmc_parse_options.cpp
@@ -62,7 +62,6 @@ Author: Daniel Kroening, kroening@kroening.com
 #include <goto-programs/remove_skip.h>
 #include <goto-programs/remove_unused_functions.h>
 #include <goto-programs/remove_vector.h>
-#include <goto-programs/rewrite_union.h>
 #include <goto-programs/set_properties.h>
 #include <goto-programs/show_goto_functions.h>
 #include <goto-programs/show_properties.h>
@@ -943,7 +942,6 @@ bool cbmc_parse_optionst::process_goto_program(
   remove_returns(goto_model);
   remove_vector(goto_model);
   remove_complex(goto_model);
-  rewrite_union(goto_model);
 
   // add generic checks
   log.status() << "Generic Property Instrumentation" << messaget::eom;

diff --git a/src/goto-diff/goto_diff_parse_options.cpp b/src/goto-diff/goto_diff_parse_options.cpp
@@ -40,7 +40,6 @@ Author: Peter Schrammel
 #include <goto-programs/remove_unused_functions.h>
 #include <goto-programs/remove_vector.h>
 #include <goto-programs/remove_virtual_functions.h>
-#include <goto-programs/rewrite_union.h>
 #include <goto-programs/set_properties.h>
 #include <goto-programs/show_properties.h>
 #include <goto-programs/string_abstraction.h>
@@ -341,7 +340,6 @@ bool goto_diff_parse_optionst::process_goto_program(
     remove_returns(goto_model);
     remove_vector(goto_model);
     remove_complex(goto_model);
-    rewrite_union(goto_model);
 
     // add generic checks
     log.status() << "Generic Property Instrumentation" << messaget::eom;

diff --git a/src/goto-programs/write_goto_binary.h b/src/goto-programs/write_goto_binary.h
@@ -12,7 +12,7 @@ Author: CM Wintersteiger
 #ifndef CPROVER_GOTO_PROGRAMS_WRITE_GOTO_BINARY_H
 #define CPROVER_GOTO_PROGRAMS_WRITE_GOTO_BINARY_H
 
-#define GOTO_BINARY_VERSION 5
+#define GOTO_BINARY_VERSION 6
 
 #include <iosfwd>
 #include <string>