diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..97f7bb31 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,3 @@ +*.idea +*.html +.~lock* diff --git a/docs/sphinx/.gitignore b/docs/sphinx/.gitignore new file mode 100644 index 00000000..7c5f0a37 --- /dev/null +++ b/docs/sphinx/.gitignore @@ -0,0 +1,2 @@ +build +source diff --git a/docs/sphinx/Makefile b/docs/sphinx/Makefile new file mode 100644 index 00000000..20d28110 --- /dev/null +++ b/docs/sphinx/Makefile @@ -0,0 +1,216 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
+ +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/S2E.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/S2E.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/S2E" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/S2E" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/sphinx/build.sh b/docs/sphinx/build.sh new file mode 100755 index 00000000..e7e25ffa --- /dev/null +++ b/docs/sphinx/build.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# The RST renderer in Sphinx and Github are not fully compatible with each other. +# This script makes a copy of the rst sources, patches them, and builds them with Sphinx. +# The original source should be kept as much as possible in Github's dialect so that people +# may use Github to browse the documentation. + +# This is optional +GOOGLE_ANALYTICS_ID="$1" + +# Copy files into a temp folder, then patch them to make sphinx happy +mkdir -p source +rsync -v -cr --delete ../src/* source/ +cp -rp source_templates/* source +$(cd source && sed -i 's/number-lines/linenos/g' $(find . -name '*.rst')) + +make html + +echo "Replacing .rst with .html in links" +$(cd build/html && sed -i 's/\.rst\"/\.html\"/g' $(find . -name '*.html')) + +if [ "x${GOOGLE_ANALYTICS_ID}" != "x" ]; then + echo "Customizing google analytics id" + $(cd build/html && sed -i "s/UA-XXXX-X/${GOOGLE_ANALYTICS_ID}/g" $(find . -name '*.html')) +fi + +linkchecker ./build/html/index.html diff --git a/docs/sphinx/source_templates/conf.py b/docs/sphinx/source_templates/conf.py new file mode 100644 index 00000000..b2c96912 --- /dev/null +++ b/docs/sphinx/source_templates/conf.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +# +# S2E documentation build configuration file, created by +# sphinx-quickstart on Wed Feb 21 22:30:09 2018. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'S2E' +copyright = u'2018, Cyberhaven' +author = u'Vitaly Chipounov' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'2.0' +# The full version, including alpha/beta/rc tags. +release = u'2.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'bizstyle' +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + 'canonical_url': '', + 'logo_only': False, + 'display_version': False, + 'prev_next_buttons_location': 'bottom', + 'style_external_links': False, + 'vcs_pageview_mode': '', + + # Toc options + 'collapse_navigation': False, + 'sticky_navigation': True, + 'navigation_depth': 4, + 'includehidden': True, + 'titles_only': False +} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. 
+#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'S2Edoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + (master_doc, 'S2E.tex', u'S2E Documentation', + u'Vitaly Chipounov', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 's2e', u'S2E Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'S2E', u'S2E Documentation', + author, 'S2E', 'Platform for in-vivo multi-path analysis of software systems', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/src/Architecture.rst b/docs/src/Architecture.rst new file mode 100644 index 00000000..3b7c167a --- /dev/null +++ b/docs/src/Architecture.rst @@ -0,0 +1,3 @@ +================ +S2E Architecture +================ diff --git a/docs/src/BuildingS2E.rst b/docs/src/BuildingS2E.rst new file mode 100644 index 00000000..aa3c5608 --- /dev/null +++ b/docs/src/BuildingS2E.rst @@ -0,0 +1,127 @@ +================================== +Building the S2E platform manually +================================== + +The simplest way to build S2E is to use the ``s2e-env`` tool. It is the preferred method for development. See `Using +s2e-env `__ for instructions. However, some build features are not exposed by ``s2e-env`` and you will have +to run them manually. + +.. note:: + + S2E builds and runs on Ubuntu 18.04 LTS (64-bit). + Earlier versions may still work, but we do not support them anymore. + + +Building using Docker +===================== + +You can create a self-contained docker image that lets you analyze any supported binary. The following command builds +the demo docker image. + +.. code-block:: console + + # Checkout S2E sources (using Google Repo) + # S2EDIR must be in your home folder (e.g., /home/user/s2e) + cd $S2EDIR + repo init -u https://github.com/s2e/manifest.git + repo sync + + # Create a build directory + mkdir build && cd build + + # Build the docker image + make -f $S2EDIR/Makefile.docker demo + +You can then run it as follows: + +.. code-block:: console + + docker run --rm -ti -w $(pwd) -v $HOME:$HOME cyberhaven/s2e-demo /demo/run.sh $(id -u) $(id -g) /demo/CADET_00001 + +This command starts the s2e-demo container and creates an S2E environment in ``$(pwd)/s2e-demo``. 
It then downloads a +VM image and creates a default S2E configuration suitable for running the specified binary. Once configuration is done, +the container starts S2E. + +The S2E environment in ``$(pwd)/s2e-demo`` is persistent. It is not kept in the container. You may run the container +multiple times without losing the previous settings. This is possible by mounting your home folder in the container. +The command also takes your current user and group id in order to create the environment folder with the right +permissions (docker uses root by default). + +You may specify any binary you want and are not restricted to binaries stored inside the container. You just need to +mount the folder that contains it using the ``-v`` option. + +Building S2E manually +===================== + +In addition to using the ``s2e-env`` tool, you can also build S2E manually. + +Required packages +----------------- + +You must install a few packages in order to build S2E manually. Instead of giving you a list that will get out-of-date +very quickly, we will give you some pointers that should be always up-to-date: + +- The packages for the build toolchain and S2E dependencies can be found in the S2E docker + `file `__. + +- The packages required to build the guest images can be found + `here `__. + +Checking out S2E +---------------- + +The S2E source code can be obtained from the S2E git repository using the following commands. Here ``$S2EDIR`` is the +directory that will hold both the S2E source and build directories. + +.. code-block:: console + + cd $S2EDIR + repo init -u https://github.com/s2e/manifest.git + repo sync + +This will set up the S2E repositories in ``$S2EDIR``. + +In order to contribute to S2E (e.g., submit new features or report bugs), please see `here `__. + +Building +-------- + +The S2E Makefile can be run as follows: + +.. code-block:: console + + mkdir $S2EDIR/build + cd $S2EDIR/build + make -f $S2EDIR/Makefile install + + # Go make some coffee, this will take some time (approx. 60 mins on a 4-core machine) + +By default, the ``make`` command compiles and installs S2E in release mode to ``$S2EDIR/build/opt``. To change this +location, set the ``S2E_PREFIX`` environment variable when running ``make``. + +To compile S2E in debug mode, use ``make install-debug``. + +Note that the Makefile automatically uses the maximum number of available processors in order to speed up compilation. + +Updating +-------- + +You can use the same Makefile to recompile S2E either when changing it yourself or when pulling new versions through +``repo sync``. Note that the Makefile will not automatically reconfigure the packages; for deep changes you might need +to either start from scratch by issuing ``make clean`` or to force the reconfiguration of specific modules by deleting +the corresponding files from the ``stamps`` subdirectory. + +Building the documentation +-------------------------- + +The S2E documentation is written in `reStructuredText `__ format. HTML +documentation can be built as follows: + +.. code-block:: console + + $ sudo apt-get install linkchecker + $ pip install sphinx_rtd_theme + $ cd $S2EDIR/docs/sphinx + $ ./build.sh + +The documentation will be located in ``$S2EDIR/docs/sphinx/build/html/index.html``. 
diff --git a/docs/src/Contribute.rst b/docs/src/Contribute.rst new file mode 100644 index 00000000..a1099e5c --- /dev/null +++ b/docs/src/Contribute.rst @@ -0,0 +1,97 @@ +=================== +Contributing to S2E +=================== + +S2E welcomes any contribution, such as fixing bugs, adding new functionality, documentation, code comments, etc. Below +are some guidelines which help make code review easier. + +All contributions to S2E must be **sent as pull requests** on GitHub. Please create an account and create a private +fork of the repository to which you would like to contribute. Patch contributions should not be posted on the mailing +list, bug tracker, posted on forums, or externally hosted and linked to. + +You must accept the *Contributor License Agreement* before your pull requests can be merged. This can be done with a +few clicks from GitHub. + +Patches must include a Signed-off-by: line + For more information see `submitting patches + `__. This is vital or we + will not be able to apply your patch! Please use your real name to sign a patch (not an alias name). + +Correct English is appreciated + If you are not sure, `codespell `__ or other programs help finding the + most common spelling mistakes in code and documentation. + +Patches should be against current git master + There's no point submitting a patch which is based on a released version of S2E because development will have moved + on from then and it probably won't even apply to master. + +Split up longer patches + Into a patch series of logical code changes. Each change should compile and execute successfully. For instance, do + not add a file to the makefile in patch one and then add the file itself in patch two. This rule is here so that + people can later use tools like `git bisect `__ without hitting points in the + commit history where S2E does not work for reasons unrelated to the bug they are chasing. + +Don't include irrelevant changes + In particular, don't include formatting, coding style or whitespace changes to bits of code that would otherwise + not be touched by the patch. It's OK though to fix coding style issues in the immediate area (few lines) of the + lines you're changing. If you think a section of code really does need a reindent or other large-scale style fix, + submit this as a separate patch which makes no semantic changes; don't put it in the same patch as your bug fix. + +Write a good commit message + S2E follows the following format:: + + module: short description + + Detailed description + + Signed-off-by: Your Name + +Ensure your code is formatted correctly + Before committing your code you **must** run `clang-format`. There is a script in the root directory, + ``run-clang-format.sh``, that can be used to format the entire S2E source tree. Alternatively you may use the + ``.clang-format`` file in the root of the repository. Note that Windows guest tools have their own code style. + + To check Python code, run the following `pylint `__-based script: + + .. code-block:: console + + $ cd $S2EDIR/s2e-env + $ . venv/bin/activate + $ pip install pylint + $ pylint -rn -j8 --rcfile=~/s2e/s2e/build-scripts/pylint_rc s2e_env + + There must be no warnings. + +Your code must be documented + If you write a plugin, please write Doxygen-style documentation in the source code as well as an ``.rst`` file that + explains how to use the plugin on some real examples. Please be as thorough as possible in the documentation. The + clearer it is, the fewer questions that will be asked. 
When formatting your ``.rst`` file please use a line limit + of 120 characters. + +License + Your contributions to existing code must use the license specified for that code. Each repository contains a + LICENSE file which specifies the license under which that repository is distributed. Your contribution must use + that license. If you contribute significant portions of fresh code, you may also use the MIT license for new files + that you add (copy/paste below). + + .. code-block:: cpp + + /// Copyright (c) + /// + /// Permission is hereby granted, free of charge, to any person obtaining a copy + /// of this software and associated documentation files (the "Software"), to deal + /// in the Software without restriction, including without limitation the rights + /// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + /// copies of the Software, and to permit persons to whom the Software is + /// furnished to do so, subject to the following conditions: + /// + /// The above copyright notice and this permission notice shall be included in all + /// copies or substantial portions of the Software. + /// + /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + /// SOFTWARE. diff --git a/docs/src/DebuggingS2E.rst b/docs/src/DebuggingS2E.rst new file mode 100644 index 00000000..03e0d939 --- /dev/null +++ b/docs/src/DebuggingS2E.rst @@ -0,0 +1,107 @@ +============= +Debugging S2E +============= + +S2E is a complex aggregation of various tools totaling over 1 million LOCs. Debugging S2E may be hard. The following +types of bugs are often hard to diagnose: + +* Non-deterministic crashes +* Crashes that happen after a very long execution time +* Crashes that happen all over the place and seem to be caused by corrupted pointers + +If one of these applies to your case, you may want to read the following. The following sections are sorted from +simplest to most advanced. + +.. contents:: + +The obvious checks +------------------ + +Make sure that your code compiles without warnings. A missing return statement buried somewhere may have dramatic +consequences. Fortunately, recent versions of clang put checks in the binary in such cases (as well as for other +undefined behavior), so the program will crash sooner rather than later. + +Try to run your code with address sanitizer and/or undefined behavior sanitizer. + +Record-replay +------------- + +If your bug is non-deterministic and happens 10s of minutes to hours in your experiment, you may want to try record +replay. VMware Workstation 7 has this feature and will allow you to come as close to the bug as possible so that you can +look at it in a debugger. If you somehow miss the bug, no problem, you can replay over and over again. Unfortunately, +recent versions of VMware Workstation do not have record-replay anymore and you will need a rather old setup and +processor to use version 7. + +In general, we do not find this method very useful. + +Valgrind +-------- + +Valgrind works for small-sized VMs only (typically 1-8MB of guest RAM and very little code). 
It is mostly useful when debugging the internals of the execution engine, which does not require a lot of
environment to run.

Debugging with gdb
------------------

S2E should be compiled with both symbol and debug information in order to use it with gdb:

.. code-block:: console

    cd $S2EDIR/build-s2e
    make -f $S2EDIR/build-s2e/Makefile all-debug

Before starting S2E in GDB, you need a configuration script (``gdb.ini``) that sets up environment variables properly.

.. code-block:: bash

    # GDB specific options
    handle SIG38 noprint
    handle SIGUSR2 noprint
    set debug-file-directory /usr/lib/debug/
    set disassembly-flavor intel
    set print pretty on

    # S2E configuration options
    set environment S2E_CONFIG=s2e-config.lua
    set environment S2E_SHARED_DIR=$S2EDIR/build-s2e/libs2e-debug/i386-s2e-softmmu/
    set environment LD_PRELOAD=$S2EDIR/build-s2e/libs2e-debug/i386-s2e-softmmu/libs2e.so
    set environment S2E_UNBUFFERED_STREAM=1
    set environment S2E_MAX_PROCESSES=1

Then start GDB as follows:

.. code-block:: console

    gdb --init-command=gdb.ini --args $S2EDIR/build-qemu/i386-softmmu/qemu-system-i386 \
        -drive file=/path/to/image.raw.s2e,cache=writeback,format=s2e -other -arbitrary -flags

S2E kernel debugging
--------------------

Sometimes you may need to debug the modified Linux kernel. To do this, please refer to the following `page `__.

S2E debug functions
-------------------

In order to simplify debugging, a number of helper functions callable from gdb are `available `__:

s2e_debug_print_hex(void \*addr, int len)
    Print memory (in hex) at address ``addr`` of length ``len``.

s2e_print_constraints(void)
    Print current path constraints.

s2e_print_expr(void \*expr)
    Print a symbolic expression of type ``ref``.

s2e_print_value(void \*value)
    Print an ``llvm::Value``.

To invoke these functions, use GDB's ``call`` command. For example::

    call s2e_print_expr(&param)

Where ``&param`` is the address of the expression. The output will be printed to ``debug.txt``.
Sometimes, you may need to issue the commands twice, in case the log file is not flushed.

diff --git a/docs/src/DesignAndImplementation/KvmInterface.rst b/docs/src/DesignAndImplementation/KvmInterface.rst
new file mode 100644
index 00000000..b478050a
--- /dev/null
+++ b/docs/src/DesignAndImplementation/KvmInterface.rst
@@ -0,0 +1,526 @@
=====================================
Symbolic Execution Extensions for KVM
=====================================

This document presents extensions to the Linux KVM virtualization interface that enable building multi-path analysis
tools, such as symbolic execution engines. In a nutshell, we built ``libs2e.so``, a shared library that implements the
S2E symbolic execution engine. This library can be preloaded into a target hypervisor process. The library intercepts
and emulates calls to ``/dev/kvm`` to provide symbolic execution capabilities to vanilla hypervisors like QEMU.

We will (1) provide an overview of how KVM works, (2) show how to build a library that emulates vanilla KVM using
a dynamic binary translator as a CPU emulation engine, (3) show how to add symbolic execution capabilities on top of
that, and (4) provide a reference of the new KVM extensions so that you can integrate S2E into your own hypervisors.


.. contents::


Kernel-based Virtual Machine (KVM)
==================================

The Linux kernel exposes the KVM interface through the ``/dev/kvm`` file.
A program (that we call a KVM client) that wants to create a VM opens that file and sends commands to it using the
``ioctl`` interface. The most important commands are creating and setting the state of the virtual CPU
(``KVM_CREATE_VM``, ``KVM_CREATE_VCPU``, ``KVM_SET_REGS`` and its friends), registering guest physical memory
(``KVM_SET_USER_MEMORY_REGION``), starting the VM (``KVM_RUN``), and injecting interrupts in the guest
(``KVM_INTERRUPT``). A detailed list of commands is available in the Linux kernel `documentation `__. The figure below
shows the architecture of a standard KVM setup.

.. image:: kvm_interface.svg
    :width: 75%

It is up to the KVM client (running in user space) to emulate virtual hardware, as the KVM kernel driver only provides
a virtual CPU. The clients are responsible for handling I/O, memory-mapped I/O, injecting interrupts in the guest, and
performing DMA. During initialization, the client first allocates a chunk of virtual memory using the plain ``mmap()``
system call, then registers this memory as guest physical memory using ``KVM_SET_USER_MEMORY_REGION``. KVM treats any
memory access that falls outside registered regions as memory-mapped I/O.

When the guest executes an instruction that accesses unmapped physical memory, ``KVM_RUN`` returns to the client, which
determines the type of I/O access and emulates it accordingly. For example, when a guest instruction writes to physical
address ``0xB8000``, the following occurs:

- The virtual CPU (VCPU) attempts to access that memory and realizes that it is unmapped.
- The VCPU triggers a VM exit, giving control back to the KVM driver.
- KVM determines the cause of the VM exit and returns to user space from the ``KVM_RUN`` call.
- The client reads the faulting guest physical address and determines the associated virtual device.
- The client calls the I/O handler of the VGA virtual device, which eventually displays a character in the
  upper left corner of the screen.
- Once the I/O emulation is done, the client resumes the guest VM by calling ``KVM_RUN`` again.

The KVM client injects interrupts in the guest using the ``KVM_INTERRUPT`` ioctl. Let us take the example of a virtual
clock device. This kind of device would typically trigger a periodic interrupt, e.g., every 10 ms. In order to emulate
it, the client process registers a periodic timer signal handler with the host OS. At the lowest level, the host OS
configures the host machine's clock to generate periodic host interrupts. When a host interrupt occurs, the host CPU
automatically interrupts the running guest VM and context switches to the host OS, which then delivers the timer signal
to the client process. The client then calls the virtual clock device emulation code, which uses ``KVM_INTERRUPT`` to
inject the virtual interrupt into the guest. At the next invocation of ``KVM_RUN``, the host CPU reloads the guest
context and triggers the guest's interrupt handler.

The KVM client also handles DMA. Remember that during initialization, the client mapped host virtual memory in its
address space, which it then instructed KVM to use as guest physical memory. A virtual device handler would just read
and write to that mapped host virtual memory in order to exchange data with the guest VM. It can do so from another
thread while the guest VM is running, or from the same thread when the guest VM is interrupted by a signal and control
is returned to the client.
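
To make this flow concrete, below is a minimal sketch of a vanilla KVM client written against the standard
``/dev/kvm`` API. It is deliberately simplified: error handling, VCPU register setup, and loading of guest code are
omitted, and a real client such as QEMU implements many more exit reasons and devices.

.. code-block:: c

    /* Minimal sketch of a KVM client: create a VM, register guest RAM,
     * create a VCPU, and dispatch VM exits. Error handling and CPU/guest
     * code setup are omitted for brevity. */
    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    int main(void)
    {
        const size_t ram_size = 2 << 20; /* 2 MB of guest physical memory */

        int kvm = open("/dev/kvm", O_RDWR);
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);

        /* Allocate host virtual memory and register it as guest physical memory */
        void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
            .slot = 0,
            .guest_phys_addr = 0,
            .memory_size = ram_size,
            .userspace_addr = (uintptr_t)ram,
        };
        ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);

        /* Create a VCPU and map its shared communication area */
        int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
        int run_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
        struct kvm_run *run = mmap(NULL, run_size, PROT_READ | PROT_WRITE,
                                   MAP_SHARED, vcpu, 0);

        for (;;) {
            ioctl(vcpu, KVM_RUN, 0);

            switch (run->exit_reason) {
            case KVM_EXIT_MMIO:
                /* Access to unregistered physical memory (e.g., 0xB8000):
                 * forward run->mmio.phys_addr/data/len/is_write to the
                 * corresponding virtual device model. */
                break;
            case KVM_EXIT_IO:
                /* Port I/O: emulate the device behind run->io.port */
                break;
            case KVM_EXIT_HLT:
                return 0;
            default:
                break;
            }
        }
    }

The important point for the rest of this document is that every interaction with KVM goes through ``open()``,
``mmap()``, and ``ioctl()`` on ``/dev/kvm``: this is exactly the surface that ``libs2e.so`` intercepts.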
+ +The above is the minimum required to run a guest such as Windows: a virtual CPU and a collection of virtual devices. KVM +implements many more features that can be optionally used. It has hypercalls, nested virtualization, various MSRs, etc. +KVM provides an interface to verify which features are available (called `capabilities`). For more details, refer to the +various KVM documentation files in the Linux `source +tree `__. You can also browse QEMU's `source +code `__ to understand which KVM features it uses. +Finally, there is a great article `here `_ that explains in detail how to write +your own KVM client from scratch. + + +Emulating the KVM interface +=========================== + +KVM is great for running code at native speeds, but what if we actually want to do some advanced analysis on the guest +code? What if we want to instrument it? One common method for doing that is to use dynamic binary translation. +The dynamic binary translator (DBT) takes a chunk of guest code, disassembles it, injects instrumentation code, +reassembles it, then runs the result on the host CPU. QEMU comes with a powerful DBT that can be used instead of KVM +to run the guest OS. One could take QEMU and modify its DBT in order to analyze guest code. +An alternative to this is to write a CPU emulator, wrap it under the KVM interface, and let QEMU use it transparently. + +Emulating the KVM interface allows decoupling the CPU emulation and code instrumentation infrastructure from virtual +hardware emulation. This is very powerful, as one is not stuck anymore with a given KVM client implementation. The +client can live on its own and upgrading it becomes straightforward. For example, the first prototype of S2E released +back in 2011 was tightly coupled to QEMU and was de facto stuck with the QEMU version of that time (v1.0). The current +version of S2E however emulates the KVM interface and is not tied to any particular client. In fact, upgrading +from QEMU 1.0 to QEMU 3.0 was fairly straightforward, despite over six years separating these two versions. + + +Using system call hooks for emulation +------------------------------------- + +There are different ways in which one can implement a KVM-compatible CPU emulator. We chose to take the DBT implemented +by QEMU and refactor it into a standalone user-space library. The advantage of this compared to building our own +emulation is that we get the maturity and performance of QEMU's DBT, with only a small overhead incurred by the +indirection of the KVM interface. Overall, it took about six person-month to refactor the code, most of it was the +tedious task of moving code around. We will explain the process in details later below. + +Our KVM CPU emulator comes as a user-space shared library that can be loaded into the KVM client using ``LD_PRELOAD``. +The library intercepts the ``open`` and ``ioctl`` system calls to ``/dev/kvm``, as well as ``mmap`` and a few others. +The ``open`` hook checks that the file name is ``/dev/kvm``, and if so, returns a fake handle that will be later checked +and intercepted by the ``ioctl`` and other system call hooks. We could also have replaced the KVM driver itself and +provided a kernel-mode version of this library, but this would have caused needless complexity and unnecessary switches +to kernel mode. + +We refer to the real KVM driver that ships with the Linux kernel as *native KVM* and to our DBT-based emulation of KVM +as *emulated KVM*, implemented by the *KVM emulation engine*. 
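
To make the interception mechanism more concrete, here is a minimal sketch of what such ``LD_PRELOAD`` hooks can look
like. This is not the actual ``libs2e.so`` implementation: the fake descriptor value and the
``handle_emulated_kvm_ioctl()`` entry point are placeholders invented for illustration, and the real library hooks
additional calls (``mmap`` and a few others, as noted above) and does far more bookkeeping.

.. code-block:: c

    /* Illustrative LD_PRELOAD hooks that divert /dev/kvm to a KVM emulation
     * engine. EMULATED_KVM_FD and handle_emulated_kvm_ioctl() are placeholders,
     * not actual libs2e.so symbols. */
    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <fcntl.h>
    #include <stdarg.h>
    #include <string.h>
    #include <sys/types.h>

    #define EMULATED_KVM_FD 0x7ffbeef /* placeholder "fake handle" */

    typedef int (*open_fn_t)(const char *, int, ...);
    typedef int (*ioctl_fn_t)(int, unsigned long, ...);

    /* Placeholder entry point into the DBT-based KVM emulation engine */
    int handle_emulated_kvm_ioctl(int fd, unsigned long request, void *arg);

    int open(const char *path, int flags, ...)
    {
        static open_fn_t real_open;
        if (!real_open) {
            real_open = (open_fn_t)dlsym(RTLD_NEXT, "open");
        }

        if (!strcmp(path, "/dev/kvm")) {
            /* Never open the real driver: hand out a fake handle that the
             * other hooks will recognize. */
            return EMULATED_KVM_FD;
        }

        mode_t mode = 0;
        if (flags & O_CREAT) {
            va_list ap;
            va_start(ap, flags);
            mode = va_arg(ap, mode_t);
            va_end(ap);
        }
        return real_open(path, flags, mode);
    }

    int ioctl(int fd, unsigned long request, ...)
    {
        static ioctl_fn_t real_ioctl;
        if (!real_ioctl) {
            real_ioctl = (ioctl_fn_t)dlsym(RTLD_NEXT, "ioctl");
        }

        va_list ap;
        va_start(ap, request);
        void *arg = va_arg(ap, void *);
        va_end(ap);

        if (fd == EMULATED_KVM_FD) {
            /* Route KVM commands (KVM_CREATE_VM, KVM_RUN, ...) to the
             * DBT-based emulation engine. */
            return handle_emulated_kvm_ioctl(fd, request, arg);
        }
        return real_ioctl(fd, request, arg);
    }

Such a library is typically built with ``-shared -fPIC`` and linked against ``libdl``, then activated by setting
``LD_PRELOAD`` before starting the hypervisor.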
+ +Differences between actual KVM and KVM emulation +------------------------------------------------ + +Implementing a KVM-compatible interface poses several challenges. In theory, there should be no difference between the +native KVM implementation and a DBT-based one, except for speed (and of course different CPU features). A proper +implementation of the emulated KVM should give the same outputs for identical inputs (CPU state, interrupts, etc.). In +practice however, due to how the DBT works, there are some differences. The most important one is the significant delay +with which the DBT handles injected guest interrupts. This may cause problems for some guest OSes. A lesser problem is +the inconsistent state of the emulated CPU when handling I/O operations. This only matters if the KVM client +reads the CPU state while handling I/O (e.g., VAPIC emulation in QEMU). + +The first difference between actual KVM and KVM emulation is the interrupt injection delay. The virtualization hardware +on the host CPU triggers a VM exit as soon as there is an external interrupt sent to the host (timer, network, etc). It +also triggers a VM exit as soon as the guest unmasks interrupts (e.g., by writing to APIC registers or executing the +``sti`` instruction) and there are pending interrupts (injected by the client with ``KVM_INTERRUPT``). All this happens +without delays at instruction granularity. In contrast, emulated KVM is much slower to react to these events. In the +worst case, the delays may starve lower priority interrupts, causing hangs. Some guests may even crash if interrupts +come too late (e.g., there is a timer DPC in the Windows XP kernel that is wrongly allocated on the stack, which causes +a crash if the interrupt happens too late, i.e., after the stack is cleaned). + +For performance reasons, the DBT cannot check interrupts at every instruction. Instead, it checks them at control flow +change boundaries, i.e., when there is an instruction that modifies the program counter. When the DBT enables +translation block chaining (a technique that speeds up emulation by running translated code continuously without calling +the DBT), pending interrupts are not checked at all and it is up to the KVM client to break the translation block chain +when there is a pending interrupt. Unfortunately, native KVM does not provide a standard API for that and the most +reliable way we found to handle this is to add an additional ``KVM_FORCE_EXIT`` call which the client would invoke +when there are pending interrupts. + +The second difference is the imprecise state on device I/O. When native KVM returns from ``KVM_RUN`` because the +guest executed an I/O instruction, the guest CPU's program counter points to the next instruction. In emulated KVM, +however, the program counter can point to some previous instruction close by. This is because the DBT does +not update the program counter after each instruction, for performance reasons. Instead, the DBT updates it at the next +control flow change (i.e., when the guest explicitly sets the program counter), or when there is an exception. + +This is not a problem unless the KVM client reads the CPU state when handling I/O. On QEMU, this seems to only matter +for VAPIC emulation. OSes like Windows heavily read and write the APIC's Task Priority Register (TPR). This may trigger +an excessive amount of CPU exits and kernel-user mode switches, slowing down the guest considerably. 
To solve this, QEMU +patches the guest to replace the I/O instruction that accesses the TPR with a call to BIOS code that emulates the APIC's +TPR without causing a VM exit. To do this patching, QEMU checks the instruction pattern at the program counter that +accessed the `VAPIC `__. If this program counter is wrong +(like in emulated KVM), patching will fail. We extended the KVM interface with the ``KVM_CAP_DBT`` flag to disable the +VAPIC when emulated KVM is present. Disabling it does not cause noticeable slowdowns because there are no kernel-user +mode switches involved anyway. + +Summary +------- + +To summarize, we implemented a shared library that hooks KVM calls in order to emulate the KVM interface. The library +uses DBT-based CPU emulation. In order to accommodate for shortcomings of the DBT-based method, we added two extensions +to KVM: ``KVM_CAP_DBT`` and ``KVM_FORCE_EXIT``. The first is a capability that signals to the KVM client the presence of +a DBT-based implementation so that it can adjust its behavior accordingly. The second allows faster interrupt injection. +We do not believe that these two extensions are fundamental, they could probably be eliminated with a better engineering +of the CPU emulator. + + + +Adding symbolic execution capabilities to KVM +============================================= + +In the previous section, we have seen how to build a KVM emulation engine out of a DBT that only supports one execution +path and no symbolic data. In this section, we will show how to extend that engine as well as the KVM interface in order +to support symbolic execution. We will primarily focus on the KVM interface, treating the symbolic execution engine +itself as a black box. The design and implementation of the symbolic execution engine will be covered in another write +up. + +Before we begin, let us recap how symbolic execution works. Programs take inputs, perform some computations on them, and +generate some output. If there is a conditional branch, such as ``if (x + 2) ... else ...``, the predicate is evaluated +and one or the other branch is executed. During normal execution (e.g., when running on a normal CPU), all inputs have a +concrete value (e.g., ``x=1`` or ``x=12``) and exercise only one path at a time on each run. Symbolic execution replaces +the concrete inputs with symbols (e.g., ``x=λ``) and builds symbolic expressions (e.g., ``λ + 2``) as the program +executes. When a symbolic expression reaches a conditional branch, the engine calls a constraint solver to determine +which branch to follow. In case both outcomes are feasible, the engine splits the current execution path in two by +taking a snapshot of the system state (CPU, RAM, devices) and then executes each path independently. Each path also gets +a constraint (e.g., ``λ + 2 != 0`` and ``λ + 2 == 0``) so that the constraint solver can remember where execution came +from and compute concrete outputs when execution terminates. + +A symbolic execution engine can be decomposed in two main components: one that enables multi-path execution and another +one that handles symbolic data storage and propagation. Hypervisors such as QEMU of VMware already let users take as +many snapshots as they want. These snapshots include CPU, memory, as well as device state. Multi-path execution requires +the ability to quickly create lightweight whole-system snapshots and be able to switch between them at any time. 
On top +of that, a symbolic execution engine adds the ability to store symbolic data in the snapshots and perform computations +on that symbolic data. + +Multi-path execution +-------------------- + +The hypervisor needs to be aware of multi-path execution. A vanilla hypervisor normally runs a single path at a time +and all guest memory accesses go to a fixed area of host virtual memory, all disk accesses go to the same file, etc. +In multi-path mode, however, it is necessary to redirect these addresses to *per-path* storage. In other words, each +execution path would have its own area of virtual memory, disk storage, and even device state. Furthermore, this must +be done efficiently using copy-on-write, as each path can have several gigabytes of state. + +One approach to solve this is to add several extensions to the KVM interface. The extensions include a call to +read/write memory, a call to read/write the virtual disk, and a callback to save and restore device state. The purpose +of these calls is to redirect disk or DMA accesses done by the hypervisor's virtual devices to per-state storage. This +level of indirection allows keeping the KVM emulation engine decoupled from the hypervisor, which does not need to be +aware of the mechanics of how snapshots are stored, how copy-on-write is implemented, etc. This is all done by the +symbolic execution engine. We will present next each call individually. + +The first extension lets the hypervisor specify memory regions that must be saved in the system snapshot. During +initialization, immediately after the hypervisor maps guest physical memory, it must now invoke the +``KVM_CAP_MEM_FIXED_REGION`` API, specifying the host virtual address and the size of the allocated region. The KVM +emulation engine uses this information to initialize per-state storage for that memory region, copy any data from the +original mapped region, then forbid access to that region. The hypervisor cannot dereference the original memory +anymore and must instead call ``KVM_CAP_MEM_RW``, which we will introduce next. + +The second extension implements memory accesses. When the hypervisor needs to access guest physical memory (e.g., when +performing DMA), instead of directly dereferencing a pointer to that memory, it must now invoke the ``KVM_CAP_MEM_RW`` +API. This call takes as parameters a source and destination pointer, the direction of the transfer (read/write), and +the length. The symbolic execution engine uses this information to lookup the actual per-state data associated with the +given host virtual address and returns (or writes) the requested data. + +Finally, a few extensions are needed to manage the disk and device state. Instead of accessing the virtual disk file +using read or write system calls, the hypervisor must now call ``KVM_DISK_RW``. Handling device state is a bit +different: instead of intercepting reads/and writes to every byte of the device state (which would be completely +impractical), the symbolic execution engine leverages the hypervisor's ability to save and restore device state to/from +a file. However, instead of using a file, the hypervisor calls the ``KVM_DEV_SNAPSHOT`` API. This call is only required +when forking or switching to a new execution path. You can find more details about these APIs in the reference below. + +.. note:: + + You may be wondering if these multi-path extensions are necessary. The short answer is no. If we can find a + system-level approach to managing the state (vs. 
manually inserting indirections in the code), then we do not need + them anymore. For example, it is possible to use the ``fork()`` system call of the host in order to create a new + execution path (but this is prohibitively expensive, as there would be one hypervisor process per path), or + implement lightweight system snapshots by tweaking the page tables of the host (see `Dune + `__ [OSDI'12] and `Hummingbird + `__ [HOTOS'13]). We plan to port S2E to the latter + approach, which would bring many more benefits besides simplified APIs (e.g., much faster state snapshotting and + state switching). + + +Handling symbolic data +---------------------- + +To keep things simple, we decided that symbolic data cannot leak into the KVM client and therefore the KVM API does not +need support for symbolic data exchange. We observed that symbolic data does not usually propagate through this +interface: QEMU does not normally read CPU registers or memory locations that contain symbolic data. Likewise, data +exchanged between the guest and the virtual devices is concrete. In cases where symbolic data does leak, the KVM +emulation engine concretizes it. Here is what happens when a program tries to print a string containing symbolic +characters: + + * The program running in the guest calls ``printf("%s", buf);`` where ``buf`` has one or more symbolic characters. + * ``printf`` formats the string into a temporary buffer (which now has symbolic characters too), then issues + a ``write`` system call with the address of that temporary buffer and the file descriptor of the console as + a parameter. + * The kernel forwards the ``write`` request to the console driver. + * The console driver writes each character to the video memory. + * The KVM emulation engine determines that the write requires a VM exit to the hypervisor because the address points + to a memory-mapped I/O region. The engine also checks whether the instruction has symbolic data in its data operand + and if yes, concretizes the symbolic data before calling the hypervisor, which sees the concrete value. + Concretization adds a path constraint to ensure correct symbolic execution when control is passed back to + the program. + +Restricting the KVM interface to concrete data brings massive simplifications to the system. There is no need to rewrite +a potentially large and complex hypervisor to support symbolic data. And in practice, simply redirecting the program's +output to ``/dev/null`` or a symbolic file in a RAM disk is enough to work around most concretizations issues (e.g., +when symbolic data is written to the console or the virtual disk). Of course, one may want to symbolically execute +virtual devices (e.g., when testing device drivers). The solution for this is to write a symbolic device model, which we +leave out for another tutorial. + + +Reference +========= + +This section explains in detail the new KVM extensions that a KVM client should support in order to be compatible +with the KVM emulation engine. Each command is described as follows: + +* Command: indicates the name of the command. +* Capability: indicates the KVM capability that signals the presence of that command. +* Requirement: indicates when that capability/command must be supported. Some commands are only required for multi-path + execution, some are required in all cases. +* Any associated data structures. These are passed along the command identifier to the ``ioctl`` system call. +* The command description. + +.. 
note:: + + Here is a pointer to S2E's source code where you can find the implementation of all these extensions. + `libs2e.c `__ is the main entry point of the + ``libs2e.so`` shared library. This module intercepts IOCTLs to ``/dev/kvm`` and forwards them to the appropriate + handlers. If you are lost in the 90 KLOC that comprise ``libs2e.so``, just start from this file and work your + way up to the other components. This should help you get started hacking! + + +Dynamic binary translation +-------------------------- + +=========== ===================================================== +Command N/A +Capability KVM_CAP_DBT +Requirement Mandatory for any KVM emulation engine that uses DBT +=========== ===================================================== + +This capability indicates to the client that the underlying KVM implementation uses dynamic binary translation instead +of actual hardware virtualization. Until the KVM emulation engine perfectly mimics the native KVM interface, this +capability allows the client to adjust its behavior to support the KVM emulation engine. + + +Registering memory regions +-------------------------- + +=========== ======================================================================== +Command KVM_MEM_REGISTER_FIXED_REGION +Capability KVM_CAP_MEM_FIXED_REGION +Requirement - Mandatory for a KVM emulation engine that supports multi-path execution + - Optional for single-path implementations +=========== ======================================================================== + +.. code-block:: c + + struct kvm_fixed_region { + const char *name; + __u64 host_address; + __u64 size; + + #define KVM_MEM_SHARED_CONCRETE 1 + __u32 flags; + }; + +The KVM client must call this API after it allocates guest physical memory (either RAM or ROM) in order to register them +with the KVM emulation engine. The client must register all memory regions before calling ``KVM_RUN``. The client must +not later pass to ``KVM_SET_USER_MEMORY_REGION`` any region (or part thereof) that has not been previously registered +with ``KVM_MEM_REGISTER_FIXED_REGION``. + +This API lets the KVM emulation engine register internal data structures that will track later accesses done with +``KVM_MEM_RW``. After this API return, the memory chunk specified by ``host_address`` and ``size`` becomes read and +write-protected. The client must not access it directly anymore and must always use ``KVM_MEM_RW`` instead. Protecting +the region is helpful to catch any stray accesses and help with debugging. + +The ``KVM_MEM_SHARED_CONCRETE`` flag specifies whether the given memory chunk may be shared among all execution paths. +This is useful for video memory, which is typically write-only and whose state does not matter for correct guest +execution (i.e., different execution paths clobbering each other's frame buffers has usually no bad effect on +execution correctness as long as guest code does not read that data back). + + +Accessing guest memory +---------------------- + +=========== ======================================================================== +Command KVM_MEM_RW +Capability KVM_CAP_MEM_RW +Requirement Mandatory for all KVM emulation engine implementations +=========== ======================================================================== + +.. 
.. code-block:: c

    struct kvm_mem_rw {
        /* source and dest are always host pointers */
        __u64 source;
        __u64 dest;
        __u64 is_write;
        __u64 length;
    };

This capability signals to the KVM client that it must perform all accesses to guest physical memory through the
``KVM_MEM_RW`` API. For single-path emulators, this is required to properly flush the CPU's code cache in case DMA
touches memory that contains code. For multi-path emulators, this also ensures that data is read from and written to
the correct execution state.


Interrupting execution
----------------------

=========== ========================================================================
Command     KVM_FORCE_EXIT
Capability  KVM_CAP_FORCE_EXIT
Requirement Mandatory for KVM emulation engine implementations that cannot respond quickly to interrupt injection
=========== ========================================================================

This capability signals to the KVM client that the KVM emulation engine cannot return from ``KVM_RUN`` quickly enough
on its own (e.g., when signals are pending). When this capability is present, the KVM client must call
``KVM_FORCE_EXIT`` whenever it needs ``KVM_RUN`` to exit.


Virtual disk I/O
----------------

=========== ========================================================================
Command     KVM_DISK_RW
Capability  KVM_CAP_DISK_RW
Requirement - Mandatory for a KVM emulation engine that supports multi-path execution
            - Optional for single-path implementations or when the client does not support virtual disks
=========== ========================================================================

.. code-block:: c

    struct kvm_disk_rw {
        /* Address of the buffer in host memory */
        __u64 host_address;
        /* 512-byte sectors */
        __u64 sector;
        /* input: sectors to read/write, output: sectors read/written */
        __u32 count;
        __u8 is_write;
    };


The KVM client must invoke this command when it would otherwise write disk data to a file. The KVM emulation engine
takes the disk data specified in the ``kvm_disk_rw`` structure and stores it in a location associated with the current
execution path. If the client fails to invoke this command during multi-path execution, the disk state is shared by all
execution paths, leading to virtual disk corruption, as the different paths clobber each other's disk data.

In practice, KVM clients should implement copy-on-write mechanisms. For reads, the client must first call
``KVM_DISK_RW`` to get any dirty sectors, and if there are none, read from the underlying image file. For writes, the
client should directly call ``KVM_DISK_RW`` with the modified sector data.


Saving/restoring device snapshots
---------------------------------

=========== ========================================================================
Command     KVM_DEV_SNAPSHOT
Capability  KVM_CAP_DEV_SNAPSHOT
Requirement - Mandatory for a KVM emulation engine that supports multi-path execution
            - Optional for single-path implementations
=========== ========================================================================

.. code-block:: c

    struct kvm_dev_snapshot {
        __u64 buffer;
        /* If is_write == 0, indicates the expected size in case of error */
        __u32 size;

        /* Only when is_write == 0, indicates the position from which to read the state */
        __u32 pos;
        __u8 is_write;
    };

This command should only be called when ``KVM_RUN`` returns the ``KVM_EXIT_SAVE_DEV_STATE`` or
``KVM_EXIT_RESTORE_DEV_STATE`` exit code.

When saving a device snapshot (``is_write = 1``), only ``buffer`` and ``size`` are valid. ``buffer`` must point to a
host virtual address containing the state of all virtual devices. The KVM client must call ``KVM_DEV_SNAPSHOT`` only
once. The call returns the number of bytes written, which should be equal to ``size``.

When restoring a device snapshot (``is_write = 0``), the command allows reading any range of snapshot data previously
saved. ``pos`` and ``size`` must be set to read the desired chunk of data. The KVM client may call ``KVM_DEV_SNAPSHOT``
multiple times to read the snapshot in chunks. The call returns the number of bytes effectively read, which may be
smaller than ``size`` in case the specified range exceeds the amount of data in the snapshot.


Setting the clock scale pointer
-------------------------------

=========== ========================================================================
Command     KVM_SET_CLOCK_SCALE
Capability  KVM_CAP_CPU_CLOCK_SCALE
Requirement - Mandatory when the overhead of the KVM emulation engine is large
            - Optional otherwise
=========== ========================================================================

This command communicates to the KVM emulation engine the address of a variable that contains the clock scale.
The address must be in the KVM client's address space. The KVM client must honor this factor as soon as possible,
typically the next time a virtual device calls a time-related function (e.g., to schedule a timer interrupt).

The clock scale is an integer that specifies by what factor the client must slow down the guest's virtual clock.
A factor of one indicates no slowdown (real time). A factor of two indicates that the client must run its clock two
times slower than real time. In other words, for every second of elapsed time seen by the guest, the wall time
advances by two seconds.

The KVM emulation engine sets the clock scale when it performs slow operations, e.g., interpreting LLVM instructions
in the symbolic execution engine. This may be several orders of magnitude slower than real time (a 100-1000x clock
scale factor). Failing to set the factor accordingly would cause the client to inject timer interrupts too frequently,
preventing the guest from making any progress.


KVM_RUN exit codes
------------------

When the ``KVM_RUN`` command exits, it indicates to the KVM client the reason for the exit in the form of an exit code.
In addition to the standard codes, the KVM emulation engine adds the following exit codes. They should be implemented
by any client that supports multi-path execution.

``KVM_EXIT_FLUSH_DISK``

    This exit code indicates to the client that it must flush any buffers associated with virtual disks.
    The client should call ``KVM_DISK_RW`` in order to flush any in-progress transfers before invoking ``KVM_RUN``
    again.

    The KVM emulation engine returns this code when it is ready to fork a new execution path or in any other case
    where it needs the disk state to be consistent.

    Implementing this code is optional if the client does not support virtual disks.
+ + +``KVM_EXIT_SAVE_DEV_STATE`` + + This exit code indicates to the client that it must take a snapshot of all virtual devices and send the + snapshot data to the KVM emulation engine using the ``KVM_DEV_SNAPSHOT`` command. + + The KVM emulation engine returns this code when it is ready to fork a new execution path or wants to switch to + another execution path. In either case, it needs the virtual device state to be committed to the per-state storage + before continuing. + + +``KVM_EXIT_RESTORE_DEV_STATE`` + + This exit code indicates to the client that it must restore a snapshot of all virtual devices after reading + the snapshot data from the KVM emulation engine by using the ``KVM_DEV_SNAPSHOT`` command. + + The KVM emulation engine returns this code when it wants to switch to another execution path and needs the client + to restore the associated virtual device state. + + +``KVM_EXIT_CLONE_PROCESS`` + + This exit code indicates to the KVM client that it must re-initialize the state of all its threads. + + The KVM emulation engine returns this code after it calls the ``fork()`` system call in order to create a new + instance of the emulator. In this new instance, there is only one thread (the one that called ``fork()``). The + client must ensure that before calling KVM_RUN again, the new process instance is completely independent from the + parent one and can run on its own. In particular, the client must close and re-open any file descriptors that + ``fork()`` would otherwise share with the parent. diff --git a/docs/src/DesignAndImplementation/kvm_interface.odg b/docs/src/DesignAndImplementation/kvm_interface.odg new file mode 100644 index 00000000..2115c4ab Binary files /dev/null and b/docs/src/DesignAndImplementation/kvm_interface.odg differ diff --git a/docs/src/DesignAndImplementation/kvm_interface.svg b/docs/src/DesignAndImplementation/kvm_interface.svg new file mode 100644 index 00000000..539e75e2 --- /dev/null +++ b/docs/src/DesignAndImplementation/kvm_interface.svg @@ -0,0 +1,1207 @@ + + + +image/svg+xmlKVM Client +(e.g., QEMU) +Virtual Hardware +KVM Interface +Guest OS +KVM Driver +/dev/kvm +VM initialization +Interrupt injection +KVM_RUN +... +Device I/O +Guest faults +Guest CPU idle +... +User space +Kernel space +ioctl +Hardware Virtualization + \ No newline at end of file diff --git a/docs/src/EquivalenceTesting.rst b/docs/src/EquivalenceTesting.rst new file mode 100644 index 00000000..be440849 --- /dev/null +++ b/docs/src/EquivalenceTesting.rst @@ -0,0 +1,134 @@ +=================== +Equivalence Testing +=================== + +Given two functions ``f`` and ``g``, we would like to check whether for all input, they produce the same output. In +other words, we verify whether ``f`` and ``g`` are equivalent. + +Equivalence testing is useful to check that two implementations of the same algorithm produce identical results. E.g., +scripts may parse the output of some tools (e.g., ls) and thus expect certain behavior. The developers of alternative +and compatible implementations (e.g., ls for Busybox) of such tools can use equivalence testing to gain confidence that +their implementation behaves in the same way. Another example is to check the equivalence of two implementations of the +same protocol (e.g., TCP/IP). + +In this tutorial, we show how to perform equivalence testing with S2E. 
At a high level, this is done by passing
identical symbolic inputs to the functions, letting S2E explore the execution tree, and, when both functions return,
checking whether they produced the same output (e.g., with an assertion).

Program to Test
---------------

This sample program contains two implementations of the factorial algorithm that we want to test for equivalence. Both
implementations behave in the same way except in some corner cases.

.. code-block:: c

    #include <inttypes.h>
    #include <s2e.h>

    /**
     * Computes x!
     * If x > max, computes max!
     */
    uint64_t factorial1(uint64_t x, uint64_t max) {
        uint64_t ret = 1;
        for (uint64_t i = 1; i <= x && i <= max; ++i) {
            ret = ret * i;
        }
        return ret;
    }

    /**
     * Computes x!
     * If x > max, computes max!
     */
    uint64_t factorial2(uint64_t x, uint64_t max) {
        if (x > max) {
            return max;
        }

        if (x == 1) {
            return x;
        }
        return x * factorial2(x-1, max);
    }

    int main() {
        uint64_t x;
        uint64_t max = 10;

        // Make x symbolic
        s2e_make_symbolic(&x, sizeof(x), "x");

        uint64_t f1 = factorial1(x, max);
        uint64_t f2 = factorial2(x, max);

        // Check the equivalence of the two functions for each path
        s2e_assert(f1 == f2);

        // In case of success, terminate the state with the
        // appropriate message
        s2e_kill_state(0, "Success");
        return 0;
    }

Compile it as follows after adapting the include path:

.. code-block:: console

    gcc -I /path/to/s2e/guest/include/ -std=c99 -o factorial factorial.c

Configuring S2E
---------------

We will instruct S2E to compute test cases at the end of each execution path (i.e., when ``s2e_kill_state`` is called),
in order to reproduce potential assertion failures on our own. A test case consists of concrete program inputs that
would drive the program down the corresponding execution path.

.. code-block:: lua

    -- File: config.lua
    s2e = {
        kleeArgs = {
            -- Switch states only when the current one terminates
            "--use-dfs-search"
        }
    }
    plugins = {
        -- Enable S2E custom opcodes
        "BaseInstructions",

        -- Basic tracing required for test case generation
        "ExecutionTracer",

        -- Enable the test case generator plugin
        "TestCaseGenerator",
    }

Running the Program in S2E
--------------------------

Run the program in S2E. Refer to `this tutorial `__ for more details. S2E will
exit when all paths terminate.

Make sure to have at least 8 GB of available virtual memory and set the stack size to unlimited using
``ulimit -s unlimited``.

Interpreting the Results
------------------------

After the run, the ``s2e-last/messages.txt`` file contains the following output:

* Messages explaining the reason why each state terminated (either success or failure)
* The concrete input that would allow replaying the same path independently of S2E

For several states, we see the following type of message::

    message: "Assertion failed: f1 == f2"
    TestCaseGenerator: processTestCase of state 0 at address 0x8048525
    x: 7f 7f 7f 7f 7f 7f 7f 7f

This indicates that when ``x == 0x7f7f7f7f7f7f7f7f``, the two implementations of factorial produce different outputs.
To reproduce this behavior, take the computed value for x (it is displayed in little-endian format by the test case
generator), plug it into the original program, and run the program in a debugger to understand what happens. Once you
have fixed the deviating behavior, rerun the program in S2E until all states terminate with success.
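
If you want to double-check the deviation outside of S2E before firing up a debugger, you can plug the generated value
into a small concrete driver. The following sketch is not part of the original tutorial (the file name and output
strings are arbitrary); it reuses the two factorial implementations from the sample program above:

.. code-block:: c

    /* factorial_repro.c - concrete reproducer for the generated test case.
     * factorial1() and factorial2() are copied from the sample program above. */
    #include <stdio.h>
    #include <inttypes.h>

    uint64_t factorial1(uint64_t x, uint64_t max) {
        uint64_t ret = 1;
        for (uint64_t i = 1; i <= x && i <= max; ++i) {
            ret = ret * i;
        }
        return ret;
    }

    uint64_t factorial2(uint64_t x, uint64_t max) {
        if (x > max) {
            return max;
        }
        if (x == 1) {
            return x;
        }
        return x * factorial2(x - 1, max);
    }

    int main(void) {
        /* The test case bytes "7f 7f ... 7f" are little endian,
         * i.e., x = 0x7f7f7f7f7f7f7f7f. */
        uint64_t x = 0x7f7f7f7f7f7f7f7fULL;
        uint64_t max = 10;

        printf("factorial1 = %" PRIu64 "\n", factorial1(x, max));
        printf("factorial2 = %" PRIu64 "\n", factorial2(x, max));
        return 0;
    }

Compiling and running this with plain ``gcc`` prints 3628800 (i.e., 10!) for ``factorial1`` but 10 for ``factorial2``:
when ``x > max``, ``factorial2`` returns ``max`` instead of ``max!``, which is the corner case that triggers this
particular assertion failure.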
diff --git a/docs/src/FAQ.rst b/docs/src/FAQ.rst new file mode 100644 index 00000000..e8dd9e8d --- /dev/null +++ b/docs/src/FAQ.rst @@ -0,0 +1,144 @@ +================================ +Frequently Asked Questions (FAQ) +================================ + +How do I know what S2E is doing? +================================ + +1. Enable execution tracing and use the fork profiler to identify the code locations that fork the most. In your + configuration file, enable the ``ExecutionTracer``, ``ModuleTracer`` and the ``ModuleExecutionDetector`` plugins. + This will allow you to collect all fork locations. Additionally, you can use ``TranslationBlockTracer`` in order to + have a detailed trace for each execution path, which you can view with the ``tbtrace`` tool. Finally, + ``TranslationBlockCoverage`` allows you to view the basic block coverage in either IDA Pro or Radare (as described + in the `Coverage `__ tutorial). + +2. Look at ``s2e-last/debug.txt`` and other files. These files list all the major events occurring during symbolic + execution. If there are no messages in the logs and the CPU usage is 100%, it may be that execution is stuck + in the constraint solver. Use the `perf top` Linux utility to get a call stack to identify which part of S2E is busy. + +3. ``run.stats`` contains many types of statistics. S2E updates this file about every second, when executing symbolic + code. See later in this FAQ for a description of its fields. + +Execution seems stuck/slow. What to do? +======================================= + +First, ensure that you configured S2E properly. The ``s2e-env`` does its best to have a minimal default configuration +that works in most cases. + +* By default, S2E flushes the translation block cache on every state switch. S2E does not implement copy-on-write for + this cache, therefore it must flush the cache to ensure correct execution. Flushing avoids clobbering in case there + are two paths that execute different pieces of code loaded at the same memory locations. Flushing is **very** + expensive in case of frequent state switches. In most of the cases, flushing is not necessary, e.g., if you execute a + program that does not use self-modifying code or frequently loads/unloads libraries. In this case, use the + ``--flush-tbs-on-state-switch=false`` option. + +* Make sure your VM image is minimal for the components you want to test. ``s2e-env`` generates working Linux images, + but if you created it manually, make sure it follows some basic guidelines. In most cases, it should not have swap + enabled and all unnecessary background daemons should be disabled. Refer to the `image installation + `__ tutorial for more information. + +Second, throw hardware at your problem + +* Refer to the `How to run S2E on multiple cores `__ tutorial for instructions. + +Third, use S2E to *selectively* relax and/or over-constrain path constraints. + +* First, run the `fork profiler `__ to understand which program counters in your program fork + the most. Usually, functions such as ``printf`` can be stubbed out, and others like ``strlen`` can be modeled + to decrease drastically the number of forks. `s2e.so `__ comes with models for + several such functions. You can also give `state merging `__ a try. + +* Understanding what to select can be made considerably easier if you `attach a debugger `__ to + the S2E instance. + +* If you use a depth-first search and execution hits a polling loop, rapid forking may occur and execution may never + exit the loop. 
Moreover, depending on the accumulated constraints, each iteration may be slower and slower. Make sure + you use concolic execution, depth first search, and a test input to make sure at least the first path terminates. + Then, try to use a different search strategy or kill unwanted execution paths. Plugins such as ``EdgeKiller`` may + help you kill unwanted loop back edges. + +* Try to relax path constraints. For example, there may be a branch that causes a bottleneck because of a complex + constraint. You could try to stub out the function in which that branch is contained. For example, if it's a CRC + check that gets stuck in the solver, you may want to replace the CRC function with a stub that returns an unconstrained + symbolic value. This trades execution consistency for execution speed. Unconstraining execution may create paths + that cannot occur in real executions (i.e., false positives), but as long as there are few of them, or you can detect + them a posteriori, this is an acceptable trade-off. + +How do I deal with path explosion? +================================== + +Use S2E to *selectively* kill paths that are not interesting and prevent forking outside modules of interest. The +following describes concrete steps that allowed us to explore programs most efficiently. + +1. Run your program with minimum symbolic input (e.g., 1 byte) and with tracing enabled (see first section). + +2. Insert more and more symbolic values until path explosion occurs (i.e., it takes too long for you to explore all the + paths or it takes too much memory/CPU resources). + +3. Extract the fork profile and identify the code locations that fork the most. + +4. If forking occurs outside the module of interest, the following may help: + + * Concretize some symbolic values when execution leaves the module of interest. You may need to use the + ``FunctionMonitor`` plugin to track function calls and concretize parameters. + * Provide example values to library functions (e.g., to ``printf``, as described previously) + * Minimize amount of symbolic data leakage into the kernel (e.g., symbolic file names or symbolic stdio). Redirect + output to ``/dev/null``. + * For dynamically-linked Linux binaries, enable the `FunctionModels `__ plugin to + return a symbolic expression rather than forking in common libc functions. + +5. Kill the paths that you are not interested in: + + * You may only want to explore error-free paths. For example, kill all those where library functions fail. + * You may only be interested in error recovery code. In this case, kill all the paths in which no errors occur. + * Write a custom plugin that probes the program's state to decide when to kill the path. + * If you exercise multiple entry points of a library (e.g., a device driver), it may make sense to choose only one + successful path when an entry point exits and kill all the others. + * Kill back-edges of polling loops using the `EdgeKiller `__ plugin. You can also use this + plugin when execution enters some block of code (e.g., error recovery). + +6. Prioritize paths according to a metric that makes sense for your problem. This may be done by writing a custom state + searcher plugin. S2E comes with several examples of searchers that aim to maximize code coverage as fast as + possible. + +How to keep memory usage low? +============================= + +You can use several options, depending on your needs. + +* Disable forking when a memory limit is reached using the ``ResourceMonitor`` plugin. + +* Explicitly kill unneeded paths. 
For example, if you want to achieve high code coverage and know that some path is + unlikely to cover any new code, kill it. + +How much time is the constraint solver taking to solve constraints? +=================================================================== + +Enable logging for constraint solving queries: + +.. code-block:: lua + + s2e = { + kleeArgs = { + "--use-query-log", + "--use-query-pc-log", + } + } + +With this configuration S2E generates two logs: ``s2e-last/queries.pc`` and ``s2e-last/solver-queries.qlog``. Look for +"Elapsed time" in the logs. + +What do the various fields in ``run.stats`` mean? +================================================= + +You can open ``run.stats`` in a spreadsheet as a CSV file. Most of the fields are self-explanatory. Here are the +trickiest ones: + +* ``QueryTime`` shows how much time KLEE spent in the solver. +* ``CexCacheTime`` adds to that time also the time spent while looking for a solution in a counter-example cache (which + is enabled by the ``--use-cex-cache`` KLEE option). ``SolverTime`` shows how much time KLEE spent in total while + solving queries (this includes all the solver optimizations that could be enabled by various solver-related KLEE + options). +* ``ResolveTime`` represents time that KLEE spent resolving symbolic memory addresses, however in S2E this is not + computed correctly yet. +* ``ForkTime`` shows how much time KLEE spent on forking states. diff --git a/docs/src/Howtos/Concolic.rst b/docs/src/Howtos/Concolic.rst new file mode 100644 index 00000000..983ea213 --- /dev/null +++ b/docs/src/Howtos/Concolic.rst @@ -0,0 +1,187 @@ +================================================= +Analyzing large programs using concolic execution +================================================= + +Symbolic execution may get stuck at the start of an execution and have a hard time reaching deep paths. +This is caused by path selection heuristics and the constraint solver. Path selection heuristics may not know +well which execution paths to choose so that execution can go deeper. For example, in a loop that depends on a +symbolic condition, the heuristics may blindly keep selecting paths that would never exit the loop. Even if the path +selection heuristics knew which path to select to go through the whole program, invoking the constraint solver on every +branch that depends on symbolic input may become increasingly slower with larger and larger path depths. + +To alleviate this, S2E also implements a form of *concolic execution*. It works like traditional symbolic execution: it +propagates symbolic inputs through the system, allowing conditional branches to fork new paths whenever necessary. In +addition to this, concolic execution augments these symbolic values with *concrete* values (hence the term *concolic*). +The concrete values give a hint to the execution engine about which path to follow first. These values can be viewed as +a *seed*. In practice, the S2E user launches the program with symbolic arguments augmented with concrete seeds that +would drive the program down the path that reaches interesting parts of that program, while forking side branches, which +S2E would then thoroughly explore once the main seed path is completed. + +Concolic execution allows the program under analysis to run to completion without getting lost in the state space, +while exploring additional paths along the seed path. 
On each branch that depends on a symbolic value, the +engine follows in priority the one that would have been followed had the program been executed with the concrete seed. +When the first path that corresponds to the initial concrete values terminates, the engine picks another path (with +the help of a search heuristic), recomputes a new set of concrete values, and proceeds similarly until this second path +terminates. Plugins that implement custom search heuristics can optimize path selection for different needs +(high code coverage, bug finding, etc.). + + +.. note:: + + You may be familiar with a different form of concolic execution initially introduced by the DART paper (PLDI 2005). + DART runs programs *concretely* along one path while collecting path constraints. When the program + terminates, DART takes one constraint, negates it, computes an alternate set of concrete input values, then re-runs + the program again using these new concrete inputs in order to exercise the new path. + + Unlike symbolic execution (and S2E), DART runs the program entirely concretely and does not + fork the state of the program. In contrast, symbolic execution (and S2E) interprets instructions symbolically and + builds an execution tree as a result of forking states when encountering a branch instruction that depends on + symbolic data. + + In the context of S2E, *concolic execution* refers to normal symbolic execution where symbolic values are + augmented with concrete data. Symbolic execution merely uses the concrete inputs as a seed to guide path exploration + along specific paths of the execution tree. + + +Executing programs in concolic mode +=================================== + +Concolic execution is the default mode of operation of S2E. You do not need to do anything special other than +specifying initial concrete data (*seeds*) when creating symbolic values. + +At the lowest level, one can create symbolic values with ``s2e_make_symbolic()``. This API writes the specified number +of symbolic bytes to the given buffer. However, before doing that, it saves into an internal cache the existing concrete +data stored in the buffer. The cache maps the symbolic values to the given concrete values and allows the substitution +of symbolic inputs with the concrete ones during expression evaluation (e.g., at fork points). + +You can also create symbolic files, symbolic program arguments, symbolic standard input, etc. All these methods +eventually call ``s2e_make_symbolic()``. You may find more information about this +`here <../Tutorials/BasicLinuxSymbex/s2e.so.rst>`__ and `here <../Tutorials/BasicLinuxSymbex/SourceCode.rst>`__. + + +FAQ +=== + +**How is concolic execution as implemented in S2E similar to / different from symbolic execution as implemented +in KLEE?** + +Concolic execution builds upon symbolic execution. They both require symbolic inputs, they both interpret instructions +symbolically, build expressions, and both require a constraint solver to be called on a branch that depend on symbolic +expressions in order to determine the feasibility of the outcomes. + +When a branch depends on symbolic data, symbolic execution does not know which branch outcome is feasible without +calling the solver. Concolic execution, however, can use concrete inputs associated with the symbolic value to determine +the feasibility of at least one branch outcome. In theory, concolic execution will not get stuck if it cannot determine +the feasibility of the alternate branch. It would simply discard the alternate path. 

**Can I disable concolic execution in S2E and just use normal symbolic execution like in KLEE?**

No. In practice, concolic execution as implemented in S2E behaves like symbolic execution. Just like symbolic
execution, concolic execution calls the constraint solver on each branch that has a symbolic condition.
In addition to that, concolic execution makes sure that the path constraints of the current state satisfy
the initial concrete inputs. You may set concrete inputs to zero (i.e., pass a zero-filled buffer to
``s2e_make_symbolic()``) if you do not care about them. This will not have any effect on the state space of the
program. Given enough resources, both approaches will discover the same states.

**If concolic execution uses concrete data to guide exploration, does it still need a constraint solver?**

Yes. Concolic execution does not know whether the other outcome of a branch is feasible or not. It must therefore
use a constraint solver on each branch that depends on symbolic input in order to determine the feasibility
of the alternate branch outcome.

**Does concolic execution call the constraint solver when a path terminates in order to compute concrete inputs?**

No. Concrete inputs for a state are fully known when the state is created or forked. The symbolic execution engine
ensures that all new constraints added to the state satisfy these initial inputs. Any new symbolic values created
in the state during execution automatically get corresponding concrete values as well. Overall, this removes a call to
the solver compared to vanilla symbolic execution.

**The test case generator produces inputs that do not look at all like my initial concrete seed, what is wrong?**

Consider the following program. It makes a 4-byte buffer symbolic while using the concrete data stored there
as seed (``abcd``).

.. code:: c

    #include <stdio.h>
    #include <s2e/s2e.h>

    int main(void) {
        char buffer[4] = {'a', 'b', 'c', 'd'};
        s2e_make_symbolic(buffer, sizeof(buffer), "mydata");
        if (buffer[0] == 'd' && buffer[1] == 'c') {
            printf("found");
        }
        return 0;
    }

The first path (i.e., state 0) follows the concrete inputs and terminates without printing anything.
The solution is ``abcd`` (i.e., taken from the initial concrete assignments).
The second path, however, may have something like ``dc12``. The first two characters
are set by the path conditions, while the last two are arbitrary because the constraint solver may choose
random values for them while still satisfying the path constraints.

Note that in the example above, the constraint solver is called only once, at the if statement, in order to verify
whether the *then* branch is feasible. It is not necessary to call the solver when the path terminates in order to
get concrete solutions, because each path already contains a valid assignment of concrete values (i.e., the seed).


**Is it possible for forked paths to have concrete inputs as close as possible to the original seed?**

S2E does not currently implement this. In theory, one would need to ask the solver to choose values from the
original seed provided that the path constraints allow it.

**When does concolic execution call the constraint solver to compute concrete inputs?**

When a state forks. At this moment, the forked state gets assigned a new set of concrete inputs. These inputs hold
for this state until it terminates. So unlike in KLEE, there is no need for S2E to call the constraint solver when a
state terminates.

**How does concolic execution interact with path selection heuristics?**

It does not. Concolic execution only ensures that each state runs along the path that
corresponds to its initial set of concrete inputs. As the path runs, it may fork one or more states, which the
path selection heuristics may later decide to schedule.

**How can I add additional constraints to a state from my plugin?**

Call ``state->addConstraint(expression, true)``. The second parameter determines whether or not the engine
should recompute concrete inputs for that state in case ``expression`` does not evaluate to ``true`` in the
given state. Note that ``addConstraint`` may fail and return false if the passed expression makes the constraint
set of the state unsatisfiable.

**I implemented custom plugins to aggressively prune paths because symbolic execution was getting stuck.
Are these plugins still useful?**

Yes. Reducing the state space by discarding uninteresting paths is always useful. Concolic execution does not solve
the path explosion problem by itself. It merely helps reach deeper parts of the program faster, assuming you provide
good seeds.

**I was previously disabling concolic execution with use-concolic-execution=false. How do I migrate?**

First, replace any calls to ``s2e_make_concolic`` with ``s2e_make_symbolic``. If you are using ``s2ecmd`` and other
S2E tools to create symbolic values, you do not need to worry about it.

Second, if you call ``state->addConstraint(...)`` from your plugins, make sure to set the second parameter to true in
order to recompute concrete values if needed. Please also check the return value (the compiler will show a warning if
you don't). If ``addConstraint`` fails, the usual action is to kill the state, as further execution of that state may
be inconsistent.

**Why did you remove s2e_make_concolic instead of s2e_make_symbolic?**

Historically, S2E used to have both the ``s2e_make_concolic`` and ``s2e_make_symbolic`` APIs to let guest code create
symbolic data. In concolic mode, ``s2e_make_symbolic`` used zeros as concrete data. In symbolic mode,
``s2e_make_concolic`` behaved like ``s2e_make_symbolic`` (i.e., it ignored the concrete data).

We believe that this unification makes it easier to use S2E. There is no need to worry about which API to use in which
case and no need to understand what concolic execution is. Fundamentally, both APIs create symbolic data because S2E is
first and foremost a symbolic execution engine. The "concolic" aspect is merely an improvement on top of symbolic
execution. It has no effect on the state space of the program.

**I have cryptographic routines in my code. Can concolic execution get through them?**

Probably not. Concolic execution will use the initial concrete values to get through cryptographic routines without
getting lost in the large state space. However, it is very likely to get stuck in the constraint solver when checking
the feasibility of a branch condition (and computing new sets of concrete inputs).

**I want to use the content of a file as a seed for concolic execution. How do I do it?**

Please refer to the `s2e.so <../Tutorials/BasicLinuxSymbex/s2e.so.rst>`_ tutorial, which explains all the different
ways you can create symbolic data.
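
To tie the answers above together, here is a small guest-code sketch of how a seed drives exploration. It is only an
illustration: the magic value, variable names, and message are made up, while ``s2e_make_symbolic`` and ``s2e_message``
are the standard guest APIs assumed to be available via the ``<s2e/s2e.h>`` header shipped with S2E's guest tools.

.. code:: c

    #include <string.h>
    #include <s2e/s2e.h>

    int main(void) {
        /* The concrete content present in the buffer at the time of the call
         * becomes the seed for this symbolic variable. */
        char hdr[4] = { 'S', '2', 'E', '!' };
        s2e_make_symbolic(hdr, sizeof(hdr), "hdr");

        /* The seed path goes straight through this check; the alternate
         * outcomes of the byte-wise comparison are forked along the way and
         * explored later with solver-generated inputs. */
        if (memcmp(hdr, "S2E!", 4) == 0) {
            s2e_message("reached the code guarded by the magic value");
        }

        return 0;
    }

With a zero-filled seed instead, the first path would take the failing side of the check, and reaching the guarded code
would depend entirely on the solver and the search heuristics.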
diff --git a/docs/src/Howtos/Coverage/ida_cat_coverage.png b/docs/src/Howtos/Coverage/ida_cat_coverage.png new file mode 100644 index 00000000..a241f5ea Binary files /dev/null and b/docs/src/Howtos/Coverage/ida_cat_coverage.png differ diff --git a/docs/src/Howtos/Coverage/index.rst b/docs/src/Howtos/Coverage/index.rst new file mode 100644 index 00000000..f8d70986 --- /dev/null +++ b/docs/src/Howtos/Coverage/index.rst @@ -0,0 +1,545 @@ +================================ +Measuring code coverage with S2E +================================ + +In this tutorial, you will learn how to measure code coverage of binaries on various OSes with S2E. +Here are some advantages of using S2E for that: + +- You do not have to recompile your binaries with GCOV support or similar in order to get + code coverage. You do not even need debug information if all you want is basic block coverage. + +- You can get code coverage for any part of the software stack (program, library, kernel modules, and even + the kernel itself). + +- You can get LCOV HTML reports when source code is available and the binary has suitable debug information + +- You can get basic block coverage reports integrated with IDA or Radare. + +- You do not even need to use symbolic execution. You could use S2E as a powerful single-path coverage tool. + +This tutorial covers the following topics: + +- How to configure S2E to record line and basic block coverage +- How to generate HTML reports with LCOV +- Examples for Linux and Windows covering programs, libraries, and kernel drivers +- How S2E records coverage information under the hood so that you can extend it for your own needs + + +.. note:: + + This tutorial assumes that you know how to set up and run simple binaries in S2E. If not, please refer to the + `s2e-env <../../s2e-env.rst>`__ documentation to get started. + + Only basic block and line coverage is supported. + Branch and function coverage is work in progress and are not supported yet. + +.. contents:: + + +Line coverage for Linux binaries +================================ + +We will use `GNU Core Utilities `__ to illustrate how to get +code coverage for Linux binaries. Coreutils are the basic file, shell and text +manipulation utilities of the GNU operating system. This part of the tutorial walks you through the process of using +S2E to analyze the ``cat`` program and generate basic block and line coverage information. + + +Building Coreutils +------------------ + +The first step is to build the coreutils package. In this tutorial, we will use version `8.26 +`__ . We will build a 32-bit version of Coreutils with debug +symbols (so that we can generate line coverage). + +.. code-block:: console + + wget https://ftp.gnu.org/gnu/coreutils/coreutils-8.26.tar.xz + tar xf coreutils-8.26.tar.xz + + cd coreutils-8.26 + mkdir build + cd build + ../configure CFLAGS="-g -m32" --prefix=$(pwd) + make + make install + +The coreutils programs will be available in ``coreutils-8.26/build/bin``. + +Running Coreutils concretely in S2E +----------------------------------- + +Run the following command in order to create a ``cat`` project. This will instruct S2E to run cat and display +the contents of ``/etc/fstab``. For the sake of simplicity, no symbolic execution will occur here (as the file +is concrete and we will not instruct S2E to make it symbolic). + +.. code-block:: console + + s2e new_project --image debian-9.2.1-i386 /path/to/coreutils-8.26/build/bin/cat /etc/fstab + +After creating the project, run S2E. 
This should take a few seconds and terminate.

.. code-block:: console

    cd projects/cat
    ./launch-s2e.sh

Generating line coverage
------------------------

``s2e-env`` generates line coverage information in the `lcov `__ format.
The following command will generate an HTML report.

.. code-block:: console

    s2e coverage lcov --html cat

This will generate the following in ``projects/cat/s2e-last``:

* A ``cat.info`` file containing the line coverage information in lcov format
* An HTML report in the ``cat_lcov`` directory

The images below show snippets from the generated HTML report:

.. image:: lcov-cat-sp1.png
    :width: 640px

.. image:: lcov-cat-sp2.png
    :width: 640px

.. note::

    At the moment, S2E does not generate branch coverage or function coverage information, so these will be
    missing from the report.

Line coverage for shared libraries
==================================

By default, the S2E code coverage plugin only records coverage information for the main project binary. If the
binary uses shared libraries, you need to add them manually. In this example, open the ``s2e-config.lua`` file and
modify the ``ModuleExecutionDetector`` plugin configuration as follows:

.. code-block:: lua
    :caption: s2e-config.lua

    add_plugin("ModuleExecutionDetector")
    pluginsConfig.ModuleExecutionDetector = {
        mod_0 = {
            moduleName = "cat",
        },

        mod_1 = {
            -- Do not forget to adapt the library name to your system.
            -- You can check which libraries your binary uses with ldd.
            moduleName = "libc-2.24.so",
        }
    }

Then re-generate the coverage:

.. code-block:: console

    cd projects/cat && ./launch-s2e.sh
    s2e coverage lcov --html cat

The command should display errors about source code that cannot be found:

.. code-block:: console

    Processing file csu/../sysdeps/x86/libc-start.c
    genhtml: ERROR: cannot read /build/glibc-6LCJ6H/glibc-2.24/csu/../sysdeps/x86/libc-start.c
    ERROR: [lcov]

In order to fix this, download the source packages as follows. Make sure first that you have built the guest VM images
yourself and did not use the pre-built ones, otherwise the docker image ``linux-build-i386`` may be missing.

.. code-block:: console

    docker run --rm -ti -v "$HOME":"$HOME" linux-build-i386
    # In the container's Bash prompt
    cd /home/user/s2e/env/projects/cat
    echo "deb-src http://deb.debian.org/debian/ stretch main" >> /etc/apt/sources.list
    apt-get update
    apt-get install dpkg-dev

    # Check out the source for the i386 version, since we run tests on the 32-bit version of the binary
    apt-get source libc6:i386
    exit

After you have run the commands above, you should have the ``glibc-2.24`` folder in your project directory.
Simply re-run the code coverage command above. You should now get the following report:

.. image:: lcov-libc-sp1.png
    :width: 640px

Basic block coverage
====================

Sometimes, you do not have source code or debug information. In this case, you can use S2E to compute
basic block coverage.

``s2e-env`` provides a subcommand that generates basic block coverage. This subcommand requires either IDA Pro, Radare,
or Binary Ninja to disassemble the target binary and extract the basic blocks from it. The different disassemblers have
different requirements.

- **IDA Pro**: You must specify the path to the IDA Pro installation in the ``s2e-env`` config file.
+- **Radare**: Radare must be installed into a location on your path and you must have the ``r2pipe`` Python package + installed via pip (see `here `__ for details). +- **Binary Ninja**: You must have a Binary Ninja license that allows "GUI-less processing". + +In order to produce this basic block listing you can run one of the following commands: + +- ``s2e coverage basic_block --disassembler=ida cat`` +- ``s2e coverage basic_block --disassembler=r2 cat`` +- ``s2e coverage basic_block --disassembler=binaryninja cat`` + +The basic block coverage subcommand will perform a block coverage analysis on ``s2e-last`` in the ``cat`` project by +mapping translation block coverage generated by the ``TranslationBlockCoverage`` plugin to the basic block information +extracted by your disassembler. The result will be written to ``projects/cat/s2e-last/cat_coverage.json``, part of +which is shown below. + +.. code-block:: json + :caption: cat_coverage.json + + { + "coverage": [ + { + "end_addr": 134516923, + "function": "__do_global_dtors_aux", + "start_addr": 134516916 + }, + { + "end_addr": 134516165, + "function": ".__fpending", + "start_addr": 134516160 + }, + { + "end_addr": 134515758, + "function": ".init_proc", + "start_addr": 134515754 + }, + ... + ], + "stats": { + "covered_basic_blocks": 215, + "total_basic_blocks": 1456 + } + } + +Later we will show how you can use this basic block coverage together with your chosen disassembler. + +Code coverage during symbolic execution +======================================= + +So far, we have seen how to get code coverage in concrete single-path executions. Everything works the same +when symbolic execution is enabled. Each path will get its own coverage file and ``s2e-env`` will automatically +aggregate all of them to produce a coverage report. In this section, we will configure ``cat`` to use symbolic +inputs and will measure the corresponding increase in coverage. + +First, create a new project called ``cat-symb`` as follows. This will re-generate a new configuration for ``cat`` +with symbolic execution enabled. The project that you generated earlier in this tutorial is preserved in the ``cat`` +folder. + +.. code-block:: console + + s2e new_project -n cat-symb --image debian-9.2.1-i386 /path/to/coreutils-8.26/build/bin/cat -T @@ + +The ``@@`` symbol tells ``s2e-env`` to generate a bootstrap file that will run ``cat`` with a symbolic file as input. +By default this symbolic file will be a 256 byte file filled with ``null`` bytes as concolic values. + +The ``-T`` option forces ``cat`` to display TAB characters (0x09). This is important because it forces ``cat`` to read +the symbolic values and fork two states - one state for the character being a TAB and another state for a character +being a non-TAB. + +To make symbolic execution a bit more interesting for ``cat``, we will have to modify this symbolic file slightly. +Instead of having the symbolic file filled with ``null`` bytes, we will add some actual text to the file to make it +more representative of using ``cat``. Open ``bootstrap.sh`` and replace ``truncate -s 256 ${SYMB_FILE}`` with: + +.. code-block:: bash + + python -c "print 'A' * 8" > ${SYMB_FILE} + +Then run S2E for a moment and terminate it. + +.. code-block:: console + + cd projects/cat-symb && ./launch-s2e.sh + # ... + # Terminate S2E after a while + killall -9 qemu-system-i386 + +Finally, get the code coverage: + +.. 
code-block:: console + + s2e coverage lcov --html cat-symb + +Compare the obtained results with the previous single-path run. Line coverage percentage should be higher. + +.. image:: lcov-cat-mp1.png + :width: 640 + +Now generate the basic block coverage (using your chosen disassembler, in this case IDA Pro): + +.. code-block:: console + + s2e coverage basic_block --disassembler=ida cat-symb + +You can then use this data for further analysis. For example, the S2E `tools `__ repo +contains an IDA Pro script to highlight the basic blocks covered by S2E during analysis. This script can be found at +``install/bin/ida_highlight_basic_blocks.py`` in your S2E environment. To run the script, open the ``cat`` binary in +IDA Pro, select "Script file" from the "File" menu and open ``install/bin/ida_highlight_basic_blocks.py``. You will be +prompted for the ``basic_block_coverage.json`` file generated by S2E. Select this file and the basic blocks executed by +S2E will be colored green. Depending on how long you let S2E run for and how many translation blocks it executed, you +should get a graph similar to the following: + +.. image:: ida_cat_coverage.png + :width: 640 + +Examining the debug log in ``s2e-last/debug.txt`` you should see a fork at address 0x8049ADE. If you look at this +address in IDA Pro, you should see a ``cmp [ebp+ch_0], 9`` at the previous instruction (address 0x8049ADA). This is +``cat`` checking if the current character is a TAB or not (as previously mentioned the ASCII value for TAB is 0x09). +Because the file contains symbolic data, a fork will occur at the ``jnz`` instruction. + +Similarly, Radare can be used to annotate the basic blocks covered by S2E with `metadata +`__. This script can be found at +``install/bin/r2_highlight_basic_blocks.py`` in your S2E environment. To run the script, open the ``cat`` binary in +Radare as follows: + +.. code-block:: console + + r2 -i install/bin/r2_highlight_basic_blocks.py projects/cat-symb/cat + +You will be prompted for the ``basic_block_coverage.json`` file generated by S2E. Enter the path to this file and the +basic blocks executed by S2E will be annotated with a ``Covered by S2E`` comment. The image below illustrates this. + +.. image:: r2_cat_coverage.png + :width: 640 + +How does S2E record and compute coverage? +========================================= + +In this section, we will take a step back and explain at a high level what components of S2E are involved to +generate coverage. This is useful in case you would like to extend S2E to generate other types of coverage. + +S2E takes as input a virtual machine and a configuration file, and outputs JSON files that contain a list of +executed program counters that belong to the modules specified in the configuration file. A module is essentially +a binary file that can be loaded and executed by the guest OS (``.exe``, ``.so``, ``.dll``, etc.), +whether in user or kernel space. + +S2E relies on three plugins to generate coverage data: an OS monitor, a module execution detector, and +a coverage generator plugin. These plugins are all enabled by default in ``s2e-config.lua`` when creating a project +with ``s2e-env``. The job of the OS monitor (e.g., ``WindowsMonitor`` or ``LinuxMonitor``) is to monitor various +system events, such as module loads/unloads, and notify any interested plugins about them. One such plugin is the +module execution detector (``ModuleExecutionDetector``). 
This plugin reacts to events from the OS monitor +in order to notify its own clients about instructions executed by the modules of interest. The module execution +detector filters out all the other instructions that do not belong to the modules of interest. Finally, the code +coverage plugin (i.e., ``TranslationBlockCoverage``) connects to the module execution detector plugin in order to +record executed instructions. + +The S2E coverage plugin records coverage during code *translation* rather than code *execution* in order to be more +efficient. The S2E execution engine continuously fetches blocks of guest code, translates them to host code, and +packages this host code into *translation blocks*. S2E stores translation blocks in a cache in order to avoid redundant +translations and speed up execution (e.g., if guest code executes a loop). The ``TranslationBlockCoverage`` plugin +listens for guest code translation events and records the start and end address of the block, as well as the size +of the block. S2E however only provides raw virtual addresses and the coverage plugin must first convert them to +addresses that are relative to the module's base address. ``TranslationBlockCoverage`` calls ``ModuleExecutionDetector`` +in order to do the conversion. After that, it records the converted addresses in the JSON file. This conversion is +important, as raw virtual addresses may be different from run to run (relocations, ASLR, etc.). + +Generating basic block coverage requires an off-line conversion step from translation blocks to basic blocks. +A translation block (TB) is a sequence of instructions that ends with a control flow change (e.g., call, jump). +A basic block (BB) is a translation block with the additional constraint that it may have only one entry. As a result, +a TB may span one or more BBs. A TB may also start in the middle of a basic block due to how the code +translator works (a TB may be interrupted at any time by an exception, and when execution returns, the translator +starts a fresh TB from the current program counter). The conversion will therefore take one TB and output all the BBs +it overlaps. This set of covered BBs will then be compared to the total set of BBs of the module in order to compute +basic block coverage. + +Generating line coverage is simpler: just translate each address in ``[tb_start, tb_start + tb_size[`` to a source +file, line number, and function name using the debug information. ``s2e-env`` relies on the DWARF information stored +in the binary in order to get this information and package it into a coverage file understood by LCOV. + + +.. note:: + + ``TranslationBlockCoverage`` only supports line/basic block coverage. It does not support branch coverage yet. + A branch coverage plugin would have to instrument branch instructions and record their outcome. The plugin + would listen to code translation events, check if the translated instruction is a branch, and if so insert + instrumentation code. This code would then determine the ``(source_pc, target_pc)`` pair and record it in the + coverage file. ``source_pc`` is the address of the current branch instruction and ``target_pc`` is already + stored in the program counter register when the instrumentation runs. + + +Line coverage for the Linux kernel +================================== + +The Linux kernel is just another module as far as S2E is concerned. Recording coverage for it works like for any +other program. + +.. warning:: + + Make sure you build S2E images yourself. 
Do not use the pre-built ones as the source information will not match + the Linux source on your system and the coverage report may be empty. + +1. Create a new project or pick an existing one. The analysis target does not matter, so you may also create + an empty project. In your S2E environment: + + .. code-block:: console + + s2e new_project -n linux-kernel -i debian-9.2.1-x86_64 --no-target --type linux + +2. Add ``vmlinux`` to the ``ModuleExecutionDetector`` plugin configuration: + + .. code-block:: lua + :caption: s2e-config.lua + + add_plugin("ModuleExecutionDetector") + pluginsConfig.ModuleExecutionDetector = { + mod_vmlinux = { + moduleName = "vmlinux", + } + } + +3. Open ``bootstrap.sh`` and add the following commands right before the kill state command at the end of the script: + + .. code-block:: bash + :caption: bootstrap.sh + + ... + + # Flush the CPU translation block cache. + # This ensures that the code coverage plugin catches as much of the kernel as possible. + # Omitting it may under-report coverage. + # This command may go anywhere, but generally right before your workload. + ${S2ECMD} flush_tbs + + # Run a kernel-intensive utility here + find /usr + + # Kill states before exiting + ${S2ECMD} kill $? "Target execution terminated" + + Alternatively, if you specified a binary to analyze during project creation, add ``${S2ECMD} flush_tbs`` + to the ``execute_target`` function in ``bootstrap.sh`` as follows. + + .. code-block:: bash + :caption: bootstrap.sh + + function execute_target { + local TARGET + TARGET="$1" + SYMB_FILE="$(prepare_inputs)" + + # ... + + # This command may go anywhere, but generally right before your workload + ${S2ECMD} flush_tbs + + # Run a kernel-intensive utility here + find / + + # ... + } + +4. Run S2E. The ``TranslationBlockCoverage`` plugins writes coverage files when states terminate so make sure + at least one state is completed before killing S2E. + + .. code-block:: console + + cd projects/linux-kernel + ./launch-s2e.sh + +5. Generate line coverage information. Do not forget the ``--include-covered-files-only`` option to keep the report + as short as possible (source files with no coverage will be omitted). + + .. code-block:: console + + s2e coverage lcov --html --include-covered-files-only linux-kernel + + This should produce a report that looks like this: + + .. image:: linux-cov.png + :width: 640px + +Line coverage for Windows binaries +================================== + +Line coverage support for Windows binaries is currently in progress. There are two main cases: + +1. Using embedded DWARF information in the executable file. + ``s2e-env`` combines ``pefile`` with ``pyelftools`` in order to read DWARF information from PE files. + Line coverage should work properly for Windows binaries compiled with DWARF information (clang, mingw, etc.). + +2. Using the PDB file produced by Microsoft compilers. + This format is not officially documented. Microsoft released some + `source code `__ that helped LLVM add support for it, so Python parsers + should come soon too. In the meantime, S2E provides a tool that converts PDB files to a JSON format + that can be read by ``s2e-env``. + + The Windows device driver testing `tutorial <../../Tutorials/WindowsDrivers/FaultInjection.rst>`__ + shows in details how to obtain line coverage for Windows kernel binaries. + + +Debugging code coverage +======================= + +The typical symptom is that the coverage report is empty. 
Here is a checklist to help with debugging: + +- Check that your binaries have debug information. Recompile them if needed. For Windows binaries, make sure + you generated the ``.lines`` file as instructed by the coverage tool. + +- Make sure the source code is available at the same location as when the binary was built. In most cases, + ``s2e-env`` will guess the proper location if you place the source in the project's directory. + +- If you do not get any coverage for the Linux kernel, check that you built the guest images on the same machine + where you run S2E (or at least all the files are at the same locations). + You cannot use pre-built images as their source paths are unlikely to match those on your machine. + +- Check that the ``tb-coverage*.json`` files contain adequate data. If they do not exist, check that you terminated + S2E after at least one state completed. Alternatively, you can use the ``writeCoveragePeriod`` option of the + ``TranslationBlockCoverage`` plugin in order to periodically dump coverage of the currently running state. + If coverage files are empty or are missing some modules, check that the S2E configuration is correct. + +- Check that ``ModuleExecutionDetector`` in ``s2e-config.lua`` is configured properly. If a module is missing, + ``TranslationBlockCoverage`` will not generate any coverage information for it. + +- Check that the ``Vmi`` plugin in ``s2e-config.lua`` is configured properly. You do not normally need to touch + this plugin as it is automatically configured with the right settings. If however you have a project generated with + an older version of ``s2e-env``, you may need to add new entries. It must have at a minimum the following entries: + + .. code-block:: lua + + pluginsConfig.Vmi = { + baseDirs = { + "/home/user/s2e/env/projects/my-project", + "/home/user/s2e/env/projects/my-project/guest-tools", + "/home/user/s2e/env/projects/my-project/guestfs" + } + } + + The OS monitor plugin may not be able to find your binaries if these search paths are missing. + +- Check that your project directory contains a symlink to guestfs: + + .. code-block:: console + + $ ls -l /home/user/s2e/env/projects/my_project + ... + lrwxrwxrwx 1 user user 51 Apr 8 18:34 guestfs -> /mnt/home3/guest-images/debian-9.2.1-x86_64/guestfs + ... + +- Check that ``projects/my_project/guestfs/vmlinux`` is valid and contains line information (try ``addr2line``) + +- Check that you did not delete temporary image build files, in particular the Linux source code. + + .. code-block:: console + + $ ls -l images/.tmp-output/linux-4.9.3-x86_64/linux-4.9.3/ + drwxrwxr-x 34 user user 4096 Apr 13 19:22 arch + drwxrwxr-x 3 user user 4096 Apr 13 19:14 block + drwxrwxr-x 2 user user 4096 Apr 13 19:12 certs + -rw-rw-r-- 1 user user 107982 Jan 19 13:27 config-i386 + -rw-rw-r-- 1 user user 18693 Jan 19 13:27 COPYING + -rw-rw-r-- 1 user user 98277 Jan 19 13:27 CREDITS + ... 
diff --git a/docs/src/Howtos/Coverage/lcov-cat-mp1.png b/docs/src/Howtos/Coverage/lcov-cat-mp1.png new file mode 100644 index 00000000..cb99c10d Binary files /dev/null and b/docs/src/Howtos/Coverage/lcov-cat-mp1.png differ diff --git a/docs/src/Howtos/Coverage/lcov-cat-sp1.png b/docs/src/Howtos/Coverage/lcov-cat-sp1.png new file mode 100644 index 00000000..11f00bb1 Binary files /dev/null and b/docs/src/Howtos/Coverage/lcov-cat-sp1.png differ diff --git a/docs/src/Howtos/Coverage/lcov-cat-sp2.png b/docs/src/Howtos/Coverage/lcov-cat-sp2.png new file mode 100644 index 00000000..40dc22e3 Binary files /dev/null and b/docs/src/Howtos/Coverage/lcov-cat-sp2.png differ diff --git a/docs/src/Howtos/Coverage/lcov-libc-sp1.png b/docs/src/Howtos/Coverage/lcov-libc-sp1.png new file mode 100644 index 00000000..75ec7fc5 Binary files /dev/null and b/docs/src/Howtos/Coverage/lcov-libc-sp1.png differ diff --git a/docs/src/Howtos/Coverage/linux-cov.png b/docs/src/Howtos/Coverage/linux-cov.png new file mode 100644 index 00000000..78764d4c Binary files /dev/null and b/docs/src/Howtos/Coverage/linux-cov.png differ diff --git a/docs/src/Howtos/Coverage/r2_cat_coverage.png b/docs/src/Howtos/Coverage/r2_cat_coverage.png new file mode 100644 index 00000000..23eabf3e Binary files /dev/null and b/docs/src/Howtos/Coverage/r2_cat_coverage.png differ diff --git a/docs/src/Howtos/Debugging.rst b/docs/src/Howtos/Debugging.rst new file mode 100644 index 00000000..7c1d6300 --- /dev/null +++ b/docs/src/Howtos/Debugging.rst @@ -0,0 +1,31 @@ +==================== +Debugging Guest Code +==================== + +It is possible to attach GDB to any running instance of S2E. S2E relies on the QEMU GDB interface, which can be enabled +with the ``-s`` command line option. This option creates a socket on port number 1234. + +.. code-block:: console + + ./i386-s2e-softmmu/qemu -s2e-config-file config.lua -s + +Once the guest is launched and the program is running, attach GDB to it. + +.. code-block:: console + + gdb /path/to/my/prog + target remote localhost:1234 + # Use gdb as usual (set breakpoints, source directories, single-step, etc.). + +Remarks +======== + +* GDB can only manipulate the current path. Use the DFS search strategy to have a coherent debugging experience. +* GDB cannot inspect symbolic variables. If you attempt to display a symbolic variable, S2E will concretize it. +* You can also debug kernel-mode code. + +Useful tips +=========== + +* At any point, if you feel that symbolic execution got stuck, attach GDB to the running S2E instance to check what + code is being executed. diff --git a/docs/src/Howtos/ExecutionTracers.rst b/docs/src/Howtos/ExecutionTracers.rst new file mode 100644 index 00000000..5966f9d9 --- /dev/null +++ b/docs/src/Howtos/ExecutionTracers.rst @@ -0,0 +1,237 @@ +======================= +Using execution tracers +======================= + +Execution tracers are S2E analysis plugins that record various information along the execution of each path. Here is a +partial list of available plugins. You can find all the tracers in the +`libs2eplugins `__ repository. + +* **ExecutionTracer**: Base plugin upon which all tracers depend. This plugin records fork points so that offline + analysis tools can reconstruct the execution tree. This plugin is useful by itself to obtain a fork profile of the + system and answer questions such as: Which branch forks the most? What is causing path explosion? + +* **ModuleTracer**: Records when and where the guest OS loads modules, programs, or libraries. 
Offline analysis tools + rely on this plugin to display debug information such as which line of code corresponds to which program counter. If + ``ModuleTracer`` is disabled, no debug information will be displayed. + +* **TestCaseGenerator**: Outputs a test case whenever a path terminates. The test case consists of concrete input + values that would exercise the given path. + +* **TranslationBlockTracer**: Records information about the executed translation blocks, including the program counter + of each executed block and the content of registers before and after execution. This plugin is useful to obtain basic + block coverage. + +* **MemoryTracer**: Records all memory accesses performed by a given process. This plugin also allows filtering + accesses by module in order to reduce the size of the trace. + +* **InstructionCounter**: Counts the number of executed instructions in a path for a given process or module. + +Most of the tracers record information only for the configured modules (except ``ExecutionTracer``, which records forks +anywhere in the system). For this, tracers need to know when execution enters and leaves the modules of interest. +Tracers rely on the ``ProcessExecutionDetector`` and ``ModuleMap`` plugins to obtain this information. +These two plugins rely on OS monitor plugins to be notified whenever the OS loads or unloads processes or modules. + + +1. Recording basic traces +========================= + +By default, an S2E project has all the required plugins configured in order to record forks, guest OS events +(e.g., process and module load/unload), and test cases. + +Consider the following program that reads an integer from a file and checks its value: + +.. code-block:: c + + #include + + int main(int argc, char **argv) { + FILE *fp = NULL; + int value = 0xdeadbeef; + + if (argc != 2) { + return -1; + } + + fp = fopen(argv[1], "r"); + if (!fp) { + printf("Could not open %s\n", argv[1]); + goto err; + } + + if (fread(&value, sizeof(value), 1, fp) != 1) { + goto err; + } + + if (value == 0) { + printf("0"); + } + + err: + if (fp) { + fclose(fp); + } + + return 0; + } + + +Compile the program above, create a new S2E project, then run it: + +.. code-block:: bash + + $ gcc -Wall -g -o test test.c + $ s2e new_project ./test @@ + $ s2e run test + + +If everything went well, you should have a non-empty ``ExecutionTracer.dat`` file in the project's directory: + +.. code-block:: bash + + $ ls -la projects/test/s2e-last/ + ... + -rw-rw-r-- 1 ubuntu ubuntu 4846 Dec 20 20:34 ExecutionTracer.dat + ... + + +2. Analyzing traces +=================== + +S2E comes with a few built-in tools that rely on execution traces in order to work. One of these tools is the +fork profiler: + +.. code-block:: bash + + $ s2e forkprofile test + + ... + # The fork profile shows all the program counters where execution forked: + # process_pid module_path:address fork_count source_file:line_number (function_name) + 01251 test:0x00000885 1 /home/vitaly/s2e/env/test.c:21 (main) + +The fork profiler looks for fork entries in ``ExecutionTracer.dat`` in order to aggregate them. It also extracts +module name information in order to provide symbol data (e.g., line numbers and source files). + +If you would like to look at the raw trace, you can use the ``execution_trace`` command in order to dump the trace +in JSON format: + +.. code-block:: bash + + s2e execution_trace -pp test + ... 
+ SUCCESS: [execution_trace] Execution trace saved to /home/ubuntu/s2e/env/projects/test/s2e-last/execution_trace.json + + +This trace encodes an execution tree: + +.. code-block:: bash + + # The first entry belongs to path 0 + {...}, + {...}, + { + "children": { + "1": [ # This is the start of path 1 + {...}, + {...}, + ... + ] + } + ... + "type": "TRACE_FORK" + }, + {...}, # Path 0 continues after forking + {...}, + ... + +At the leaves of the execution tree, there are test case entries, which the ``TestCaseGenerator`` plugin creates +when a path terminates: + + +.. code-block:: bash + + # Path 1: + { + "address_space": 225800192, + "items": [ + { + "key": "v0___symfile____tmp_input___0_1_symfile___0", + "value": "AQEBAQEBAQEBAQEBAQE...." + } + ], + "pc": 134518939, + "pid": 1295, + "state_id": 1, + "timestamp": 630185690261719, + "type": "TRACE_TESTCASE" + } + + # Path 0 + { + "address_space": 225800192, + "items": [ + { + "key": "v0___symfile____tmp_input___0_1_symfile___0", + "value": "AAAAAAAAA....." + } + ], + "pc": 134518939, + "pid": 1295, + "state_id": 0, + "timestamp": 630185689994274, + "type": "TRACE_TESTCASE" + } + +You will find similar items for module/process loads/unloads. + + +3. Recording memory traces +========================== + +In this section, we will record all memory accesses done by the program above. For this, append the following snippet +to ``s2e-config.lua``: + +.. code-block:: lua + + add_plugin("MemoryTracer") + + pluginsConfig.MemoryTracer = { + traceMemory = true, + tracePageFaults = true, + traceTlbMisses = true, + + -- Restrict tracing to the "test" binary. Note that the modules specified here + -- must run in the context of the process(es) defined in ProcessExecutionDetector. + moduleNames = { "test" } + } + + +After re-running S2E and calling `s2e execution_trace -pp test` on the new run, you should be able to find the +following snippet in `execution_trace.json`: + + +.. code-block:: json + + { + "address": 140720832808700, // 0x7FFC1F4086FC + "address_space": 255279104, + "concrete_buffer": 0, + "flags": 1, + "host_address": 0, + "pc": 94739530127322, // 0x562A440A17DA + "pid": 1251, + "size": 4, + "state_id": 0, + "timestamp": 630430187009925, + "type": "TRACE_MEMORY", + "value": 3735928559 // 0xdeadbeef + } + +This corresponds to writing ``0xdeadbeef`` to the local variable ``value`` to the address ``0x7FFC1F4086FC``. + + +4. Trace format reference +========================= + +S2E uses ``protobuf`` to record traces. You can find more details about the format `here +<../Plugins/Tracers/ExecutionTracer.rst>`__. diff --git a/docs/src/Howtos/LuaInstrumentation.rst b/docs/src/Howtos/LuaInstrumentation.rst new file mode 100644 index 00000000..99443067 --- /dev/null +++ b/docs/src/Howtos/LuaInstrumentation.rst @@ -0,0 +1,335 @@ +================================= +Instrumenting guest code with Lua +================================= + +S2E comes with several plugins that expose its monitoring and instrumentation capabilities to scripts written in the +`Lua `__ programming language. This lets users create lightweight scripts without having to write and +build C++ plugins. + +There are currently two instrumentation plugins: + +* ``LuaFunctionInstrumentation``: for function call instrumentation +* ``LuaInstructionInstrumentation``: for instrumentation of individual instructions + +Instrumenting function calls and returns +======================================== + +The ``LuaFunctionInstrumentation`` plugin allows instrumenting function calls and returns. 
It works by monitoring +machine ``call`` and ``ret`` instructions. When execution reaches a call instruction, the plugin saves the stack pointer +that references the return address, so that when a return instruction is called, the plugin can match the stack pointer +with the corresponding call instruction. This way, users only need to specify the address of the function and need not +worry about instrumenting every return instruction of that function. + +Suppose that you want to instrument a function in ``mydll.dll``, which is loaded in a process called ``my.exe``. +The address of the function (as set by the linker) is ``0x4001050``. Your ``s2e-config.lua`` would contain the +following definitions: + + +.. code-block:: lua + + -- s2e-env configures this plugin when creating a new project for the binary my.exe. + -- If you need to monitor functions from another process, make sure that the name + -- of that process is defined here. If you forget to define it, your instrumentation + -- will not be called. + add_plugin("ProcessExecutionDetector") + pluginsConfig.ProcessExecutionDetector = { + moduleNames = { + "my.exe" + }, + } + + ... + + -- Manually enable FunctionMonitor and LuaFunctionInstrumentation plugins. + -- They are not automatically included when creating a new project. + add_plugin("FunctionMonitor") + add_plugin("LuaFunctionInstrumentation") + + -- This is the configuration for LuaFunctionInstrumentation + pluginsConfig.LuaFunctionInstrumentation = { + -- For each function to instrument, provide an entry in the "instrumentation" table + instrumentation = { + -- Define an instrumentation called "my_instrumentation". + -- This may be any string you want. + my_instrumentation = { + -- The name of the module to instrument. + -- This module can either be the name of a process specified + -- in ProcessExecutionDetector, or a dynamic library that is loaded + -- in the address space of that process. + module_name = "mylibrary.dll", + + -- The name of the Lua function to call when the guest function is called. + -- The Lua function is defined later in this script. + name = "instrumentation_func", + + -- The virtual address of the guest function to instrument. + -- You can find this address using a disassembler (e.g., IDA or objdump). + pc = 0x4001050, + + -- Number of parameters that the guest function takes. + -- This parameter may be zero unless you want to access parameters + -- from the annotation or skip the execution of + -- an stdcall function (parameters popped by the callee). + param_count = 2, + + -- Set to "true" to fork a new state when this instrumentation is triggered. + -- This can be useful, e.g., to have one state run the original function + -- and another state skip that function and replace it with a faster + -- model to speed up symbolic execution. + fork = false, + + -- Calling convention of the function. Either "cdecl" or "stdcall". + convention = "cdecl", + }, + } + } + + -- The instrumentation code goes here + function instrumentation_func(state, instrumentation_state, is_call, param1, param2) + if is_call then + -- Handle the function call + g_s2e:debug("function called!") + else + -- Handle the function return + g_s2e:debug("function returned!") + end + end + +An instrumentation function has the following arguments: + +1. A ``LuaS2EExecutionState`` object. + Gives access to the execution state. It is the equivalent of ``S2EExecutionState`` in C++ plugins. + You can use this object to read/write registers/memory or kill the state. + +2. 
A ``LuaInstrumentationState`` object. + Controls the behavior of the instrumentation. For example, you can instruct S2E to skip the function call + altogether. You can read more about it later in this document. + +3. A boolean value indicating whether the function is being called or is returning. + +In addition, an instrumentation function may have one or more additional parameters, each containing the address of +a guest function argument on the stack. Note that this will only work for calling conventions that use the stack +to pass their arguments. + + +Instrumenting instructions +========================== + +Suppose you are at a "Capture the Flag" (CTF) competition and are given a binary that contains an encrypted flag. To get +that flag, you need to give a correct password on the command line. The binary is obfuscated, and you cannot extract this +flag easily, so you decide to try symbolic execution. Unfortunately, this causes quite a bit of path explosion +out-of-the-box, so you still cannot get the flag. So, you look at the binary and identify two interesting program +counters: the first one is reached when the password is correct, while the second one is executed as soon as there is an +invalid character in the password. You can use this knowledge to speed up symbolic execution. When the correct program +counter is reached, you can kill all paths and exit S2E immediately. When the bad program counter is reached, however, +you can terminate the execution path in order to avoid wasting time. For this, you could configure the +``LuaInstructionInstrumentation`` plugin as follows: + +.. code-block:: lua + + add_plugin("ProcessExecutionDetector") + pluginsConfig.ProcessExecutionDetector = { + moduleNames = { + "ctf-challenge-binary" + }, + } + + pluginsConfig.LuaInstructionInstrumentation = { + -- For each instruction to instrument, provide an entry in the "instrumentation" table + instrumentation = { + -- Defines an instrumentation called "success" + success = { + -- The name of the module that we are interested in + module_name = "ctf-challenge-binary", + + -- The name of the Lua function to call when the guest executes the instruction + name = "on_success", + + -- The virtual address of the instruction in the given module + pc = 0x800123, + }, + + -- Defines an instrumentation called "failure" + failure = { + module_name = "ctf-challenge", + name = "on_failure", + pc = 0x800565, + } + } + } + + -- An instruction instrumentation takes + -- a LuaS2EExecutionState object and a LuaInstrumentationState object. + function on_success(state, instrumentation_state) + -- Do something in the success state + g_s2e:debug("Found secret!") + + -- No need to continue running S2E - terminate + g_s2e:exit(0) + end + + function on_failure(state, instrumentation_state) + -- There is no reason to continue execution any further because any other paths + -- that will fork from here will not lead to success. + state:kill(1, "Dead-end path") + end + +This is a common pattern used by other symbolic execution engines (e.g. Angr, Manticore, etc.) for solving Capture +the Flag (CTF) challenges. This pattern allows users to specify: + +1. Program path(s) that indicate the successful capture of the flag; and +2. Program path(s) to **avoid** (e.g., because they lead to some kind of failure state). + +The above Lua code defines the ``success`` and ``failure`` instrumentation. 
The ``success`` instrumentation calls the
+``on_success`` function when the instruction at ``0x800123`` is executed in the module ``ctf-challenge-binary`` (and
+likewise for the ``failure`` instrumentation).
+
+
+.. note::
+
+    For a concrete demonstration of ``LuaInstructionInstrumentation`` and ``LuaFunctionInstrumentation``, refer to
+    the S2E `testsuite <../Testsuite.rst>`__, which contains an
+    `example `__ of how to instrument a sample CTF
+    challenge.
+
+
+
+API Reference
+=============
+
+As mentioned previously, all instrumentation functions take the following two arguments:
+
+1. A ``LuaS2EExecutionState`` object, containing the current execution state; and
+2. A ``LuaInstrumentationState`` object, containing the current state of the instrumentation.
+
+LuaS2EExecutionState
+--------------------
+
+An execution state object is a wrapper around the ``S2EExecutionState`` class. It provides the following methods:
+
+**mem()**
+    Returns the current memory state in a ``LuaS2EExecutionStateMemory`` object.
+
+**regs()**
+    Returns the current register state in a ``LuaS2EExecutionStateRegisters`` object.
+
+**createSymbolicValue(name, size)**
+    Creates a new symbolic value with the given name and size (in bytes). The symbolic value is returned as a
+    ``LuaExpression`` object.
+
+**kill(status, message)**
+    Kills the current state with the given status code (an integer) and message.
+
+**getPluginProperty(plugin_name, property_name)**
+    Retrieves a property from the given plugin and returns it as a string.
+
+**setPluginProperty(plugin_name, property_name, value)**
+    Sets a plugin property with the given string value.
+
+**debug(message)**
+    Writes the given message string to the debug log.
+
+
+LuaS2EExecutionStateMemory
+--------------------------
+
+This is a wrapper around the ``S2EExecutionStateMemory`` class.
+
+**readPointer(address)**
+    Read a (concrete) pointer at the given address.
+
+**readBytes(address, size)**
+    Read a string of (concrete) bytes from the given address.
+
+**write(address, expr)**
+    Write a ``LuaExpression`` object at the given address.
+
+**write(address, value, size)**
+    Write the given ``value`` at the specified ``address``.
+
+**makeSymbolic(address, size, name)**
+    Make a region of memory symbolic.
+
+LuaS2EExecutionStateRegisters
+-----------------------------
+
+This is a wrapper around the ``S2EExecutionStateRegisters`` class.
+
+**getPc()**
+    Return the current program counter.
+
+**getSp()**
+    Return the current stack pointer.
+
+**read(pointer, size)**
+    Read ``size`` bytes from the register at offset ``pointer``.
+
+**write(pointer, expr)**
+    Write the ``LuaExpression`` object to the register at offset ``pointer``.
+
+**write(pointer, value, size)**
+    Write the given value to the register at offset ``pointer``.
+
+LuaFunctionInstrumentationState
+-------------------------------
+
+An object of this type provides the following methods:
+
+**skipFunction(skip)**
+    Set ``skip`` to ``true`` in order to skip the function call.
+
+**isChild()**
+    Returns ``true`` if the instrumentation state is a forked child. This is used when ``fork = true`` is set
+    in the configuration.
+
+**setExitCpuLoop(exit)**
+    Set ``exit`` to ``true`` in order to exit the CPU loop when the instrumentation returns.
+    This may be useful if you modify the program counter in your instrumentation code.
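+
+For example, an instrumentation function registered with ``LuaFunctionInstrumentation`` could use ``skipFunction``
+to bypass the original guest function entirely. This is only a sketch: the Lua function name is hypothetical, and it
+assumes a configuration entry whose ``name`` field points to ``skip_expensive_function`` and whose calling convention
+is ``cdecl``.
+
+.. code-block:: lua
+
+    -- is_call is true when the guest function is entered and false when it returns
+    function skip_expensive_function(state, instrumentation_state, is_call)
+        if is_call then
+            -- Do not execute the original function body; execution resumes at the caller
+            instrumentation_state:skipFunction(true)
+            g_s2e:debug("skip_expensive_function: skipping the original guest function")
+        end
+    end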
+
+LuaInstructionInstrumentationState
+----------------------------------
+
+An object of this type provides the following methods:
+
+**skipInstruction(skip)**
+    Set ``skip`` to ``true`` in order to skip the instruction after the Lua function returns.
+
+**setExitCpuLoop(exit)**
+    Set ``exit`` to ``true`` in order to exit the CPU loop when the instrumentation returns.
+    This may be useful if you modify the program counter in your instrumentation code.
+
+
+LuaExpression
+-------------
+
+This is a wrapper around a ``klee::Expr`` object.
+
+.. warning::
+
+    Symbolic expression support in Lua scripts is currently experimental and limited.
+
+
+The ``g_s2e`` object
+--------------------
+
+Finally, the global ``g_s2e`` object is available throughout ``s2e-config.lua``. It provides the following methods:
+
+**debug(message)**
+    Write the given message string to the debug log.
+
+**info(message)**
+    Write the given message string to the info log.
+
+**warning(message)**
+    Write the given message string to the warning log.
+
+**exit(return_code)**
+    Exit S2E with the given return code.
+
+**getPlugin(plugin_name)**
+    Return a reference to the specified plugin. This allows Lua scripts to interact with compatible C++ S2E plugins.
+
+.. note::
+
+    Only a fraction of the APIs available to C++ plugins are exposed to Lua. If you find that an API is missing,
+    add it by modifying the corresponding ``LuaXXX.cpp`` file.
diff --git a/docs/src/Howtos/Parallel.rst b/docs/src/Howtos/Parallel.rst
new file mode 100644
index 00000000..e16b7763
--- /dev/null
+++ b/docs/src/Howtos/Parallel.rst
@@ -0,0 +1,40 @@
+============
+Parallel S2E
+============
+
+S2E can be run in multi-process mode in order to speed up path exploration. Each process is called a worker. Each
+worker periodically checks whether there are processor cores available, and if so, forks itself. The child worker
+inherits half of the states of the parent.
+
+To enable multi-process mode, set the environment variable ``S2E_MAX_PROCESSES=XX``, where ``XX`` is the maximum number
+of S2E instances you would like to have.
+
+Add the ``-nographic`` option, as it is not possible to fork a new S2E window.
+
+How do I process generated traces?
+----------------------------------
+
+In multi-process mode, S2E outputs traces in the ``s2e-last/XX`` folders, where ``XX`` is the sequence number of the
+S2E instance. S2E increments this number each time it launches a new instance. Note that instances can also terminate,
+e.g., when they finish exploring their respective state subtree.
+
+Each trace file contains a subtree of the global execution tree. Therefore, analysis tools must process the traces in
+the relative order of their creation. The relative order is defined by the sequence number of the instance. This can be
+done by specifying multiple ``-trace`` arguments to the offline analysis tools. For example, generating the fork
+profile of a multi-process run can be done as follows:
+
+.. code-block:: console
+
+    $S2EDIR/build-s2e/tools-release/tools/forkprofiler/forkprofiler -outputdir=s2e-last/ \
+        -trace=s2e-last/0/ExecutionTracer.dat -trace=s2e-last/1/ExecutionTracer.dat \
+        -trace=s2e-last/2/ExecutionTracer.dat -trace=s2e-last/3/ExecutionTracer.dat
+
+Limitations
+-----------
+
+* S2E can only run on a shared-memory architecture. S2E cannot start on one machine and fork new instances on other
+  machines for now.
+* It is not possible to have a separate S2E window for each process for now.
If you start with ``-nographic``, you will + not be able to manipulate the console. To start the program that you want to symbolically execute in the guest, use + the `HostFiles <../MovingFiles.rst>`__ plugin or the ``-vnc :1`` option. +* Because S2E uses the ``fork`` system call, S2E cannot run on Windows in multi-core mode. diff --git a/docs/src/Howtos/WritingPlugins.rst b/docs/src/Howtos/WritingPlugins.rst new file mode 100644 index 00000000..2506449a --- /dev/null +++ b/docs/src/Howtos/WritingPlugins.rst @@ -0,0 +1,302 @@ +========================== +How to Write an S2E plugin +========================== + +In this tutorial, we show step-by-step how to write a complete plugin that uses most of the features of the S2E plugin +infrastructure. We take the example of a plugin that counts how many times a specific instruction has been executed. +Users of that plugin can specify the instruction to watch in the S2E configuration file. We will also show how to build +the plugin so that it can communicate with other plugins and expose reusable functionality. + +Starting with an Empty Plugin +============================= + +The first thing to do is to name the plugin and create boilerplate code. Let us name the plugin ``InstructionTracker``. +You can copy/paste the ``Example`` plugin that ships with S2E (in the `libs2eplugins +`__ repository). + +Create a file named ``InstructionTracker.h`` in ``$S2EDIR/libs2eplugins/src/s2e/Plugins`` with the following content: + +.. code-block:: cpp + + #ifndef S2E_PLUGINS_INSTRTRACKER_H + #define S2E_PLUGINS_INSTRTRACKER_H + + // These header files are located in libs2ecore + #include + #include + #include + + namespace s2e { + namespace plugins { + + class InstructionTracker : public Plugin { + S2E_PLUGIN + + public: + InstructionTracker(S2E *s2e) : Plugin(s2e) {} + + void initialize(); + }; + + } // namespace plugins + } // namespace s2e + + #endif + +Then, create the corresponding ``InstructionTracker.cpp`` file in the same directory: + +.. code-block:: cpp + + #include + + #include "InstructionTracker.h" + + namespace s2e { + namespace plugins { + + // Define a plugin whose class is InstructionTracker and called "InstructionTracker" + S2E_DEFINE_PLUGIN(InstructionTracker, // Plugin class + "Tutorial - Tracking instructions", // Description + "InstructionTracker", // Plugin function name + // Plugin dependencies would normally go here. However this plugin does not have any dependencies + ); + + void InstructionTracker::initialize() { + } + + } // namespace plugins + } // namespace s2e + +Finally, we need to compile the plugin with the rest of S2E. For this, add the following line to +``$S2EDIR/libs2eplugins/src/CMakeLists.txt``, near the other plugin declarations:: + + s2e/Plugins/InstructionTracker.cpp + +Reading Configuration Parameters +================================ + +We would like to let the user specify which instruction to monitor. For this, we create a configuration variable that +stores the address of that instruction. Every plugin can have an entry in the S2E configuration file. The entry for our +plugin would look like this: + +.. code-block:: lua + + pluginsConfig.InstructionTracker = { + -- The address we want to track + addressToTrack = 0x12345, + } + +We also need to add our plugin to the ``Plugins`` table. This can be done one of two ways: + +* If using ``s2e-env``, add a call to ``add_plugin`` to ``s2e-config.lua``. + +.. 
code-block:: lua + + add_plugin("InstructionTracker") + +* If not using ``s2e-env``, you will have to create the ``Plugins`` table yourself: + +.. code-block:: lua + + Plugins = { + -- List other plugins here + "InstructionTracker", + } + +If we run the plugin as it is now, nothing will happen. S2E ignores any unknown configuration value. We need a +mechanism to explicitly retrieve the configuration value. In S2E, plugins can retrieve the configuration at any time. +In our case, we do it during the initialization phase. + +.. code-block:: cpp + + // From libs2ecore. We need this to read configuration files + #include + + // ... + + void InstructionTracker::initialize() { + m_address = (uint64_t) s2e()->getConfig()->getInt(getConfigKey() + ".addressToTrack"); + } + +Do not forget to add ``uint64_t m_address;`` as a private members of class ``InstructionTracker`` in +``InstructionTracker.h``. + +Instrumenting Instructions +========================== + +To instrument an instruction, an S2E plugin registers to the ``onTranslateInstructionStart`` core event. There are +many other core events to which a plugin can register. These events are defined in ``CorePlugin.h`` in the +`libs2ecore `__ repository. + +Extend your code as follows. Do not forget to add all new member functions to the (private) section of the class +declaration. + +.. code-block:: cpp + + // From libs2ecore. Provides the hexval function + #include + + void InstructionTracker::initialize() { + m_address = (uint64_t) s2e()->getConfig()->getInt(getConfigKey() + ".addressToTrack"); + + // This indicates that our plugin is interested in monitoring instruction translation. + // For this, the plugin registers a callback with the onTranslateInstruction signal. + s2e()->getCorePlugin()->onTranslateInstructionStart.connect( + sigc::mem_fun(*this, &InstructionTracker::onTranslateInstruction)); + } + + void InstructionTracker::onTranslateInstruction(ExecutionSignal *signal, + S2EExecutionState *state, + TranslationBlock *tb, + uint64_t pc) { + if(m_address == pc) { + // When we find an interesting address, ask S2E to invoke our callback when the address is actually + // executed + signal->connect(sigc::mem_fun(*this, &InstructionTracker::onInstructionExecution)); + } + } + + // This callback is called only when the instruction at our address is executed. + // The callback incurs zero overhead for all other instructions + void InstructionTracker::onInstructionExecution(S2EExecutionState *state, uint64_t pc) { + s2e()->getDebugStream() << "Executing instruction at " << hexval(pc) << '\n'; + // The plugins can arbitrarily modify/observe the current execution state via the execution state pointer. + // Plugins can also call the s2e() method to use the S2E API + } + +Counting Instructions +===================== + +We would like to count how many times that particular instruction is executed. There are two options: + +1. Count how many times it was executed across all paths +2. Count how many times it was executed in each path + +The first option is trivial to implement. Simply add an additional member to the class and increment it every time the +``onInstructionExecution`` callback is invoked. + +The second option requires to keep per-state plugin information. S2E plugins manage per-state information in a class +that derives from ``PluginState``. This class must implement a ``factory`` method that returns a new instance of the +class when S2E starts symbolic execution. The ``clone`` method is used to fork the plugin state. 
Both ``factory`` and +``clone`` **must** be implemented. + +Here is how ``InstructionTracker`` could implement the plugin state. + +.. code-block:: cpp + + class InstructionTrackerState : public PluginState { + private: + int m_count; + + public: + InstructionTrackerState() { + m_count = 0; + } + + virtual ~InstructionTrackerState() {} + + static PluginState *factory(Plugin*, S2EExecutionState*) { + return new InstructionTrackerState(); + } + + InstructionTrackerState *clone() const { + return new InstructionTrackerState(*this); + } + + void increment() { + ++m_count; + } + + int get() { + return m_count; + } + }; + +Plugin code can refer to this state using the ``DECLARE_PLUGINSTATE`` macro: + +.. code-block:: cpp + + void InstructionTracker::onInstructionExecution(S2EExecutionState *state, uint64_t pc) { + // This macro declares the plgState variable of type InstructionTrackerState. + // It automatically takes care of retrieving the right plugin state attached to the specified execution state + DECLARE_PLUGINSTATE(InstructionTrackerState, state); + + s2e()->getDebugStream() << "Executing instruction at " << hexval(pc) << '\n'; + + // Increment the count + plgState->increment(); + } + +Exporting Events +================ + +All S2E plugins can define custom events. Other plugins can in turn connect to them and also export their own events. +This scheme is heavily used by stock S2E plugins. For example, the `LinuxMonitor <../Plugins/Linux/LinuxMonitor.rst>`__ +plugin exports a number of events (e.g. segmentation fault, module load, etc.) that can be intercepted by your own +plugins. + +In this tutorial, we show how ``InstructionTracker`` can expose an event and trigger it when the monitored instruction +is executed ten times. + +First, we declare the signal as a ``public`` field of the ``InstructionTracker`` class. It is important that the field +be public, otherwise other plugins will not be able to register. + +.. code-block:: cpp + + class InstructionTracker : public Plugin { + // ... + + public: + sigc::signal onPeriodicEvent; + + //... + } + +Second, we add some logic to trigger the event and invoke the registered callbacks. + +.. code-block:: cpp + + void InstructionTracker::onInstructionExecution(S2EExecutionState *state, uint64_t pc) { + DECLARE_PLUGINSTATE(InstructionTrackerState, state); + + s2e()->getDebugStream() << "Executing instruction at " << hexval(pc) << '\n'; + + plgState->increment(); + + // Trigger the event + if ((plgState->get() % 10) == 0) { + onPeriodicEvent.emit(state, pc); + } + } + +That is all we need to define and trigger an event. To register for this event, a plugin invokes +``s2e()->getPlugin()``, where ``PluginName`` is the name of the plugin as defined in the +``S2E_DEFINE_PLUGIN`` macro. In our case, a plugin named ``MyClient`` would do something like this in its +initialization routine: + +.. code-block:: cpp + + // Include the plugin's header file + #include + + // Specify dependencies + S2E_DEFINE_PLUGIN(MyClient, "We use InstructionTracker", "MyClient", "InstructionTracker"); + + void MyClient::initialize() { + // Get the instance of the plugin + Instructiontracker *tracker = s2e()->getPlugin(); + + // Register to custom events + tracker->onPeriodicEvent.connect(/* Connect a handler method */); + } + +Note that S2E enforces the plugin dependencies specified in the ``S2E_DEFINE_PLUGIN`` macro. 
If a dependency is not +satisfied (e.g., the plugin is not enabled in the configuration file or is not compiled in S2E), S2E will not start and +emit an error message instead. + +It is not always necessary to specify the dependencies. For example, a plugin may want to work with reduced +functionality if a dependent plugin is missing. Attempting to call ``s2e()->getPlugin()`` returns ``nullptr`` if +the requested plugin is missing. diff --git a/docs/src/ImageInstallation.rst b/docs/src/ImageInstallation.rst new file mode 100644 index 00000000..d02d176a --- /dev/null +++ b/docs/src/ImageInstallation.rst @@ -0,0 +1,146 @@ +=========================== +Customizing stock VM images +=========================== + +This tutorial will give you an overview of the guest image building system provided with S2E so that you can start +customizing the images for your own needs. You will learn the installation steps and how to customize them +to install additional software in the guest images. + +.. warning:: + + Building S2E images manually is **not** recommended. There are too many steps and messing them up is very easy, + even if you are an advanced user. You will save dozens of hours in the long run if you use the automated + image building scripts provided with S2E. + +.. contents:: + + +Image creation overview +======================= + +Before you proceed further, read the following documents: + +1. The `s2e-env `__ documentation in order to understand how to build S2E images from the command line. + This shows the workflow that you will use routinely. + +2. The `guest-images `__ repository documentation. This + repository contains the actual image building scripts. They produce fully-functional S2E disk images, with all the + required software installed. As a user, you do not need to call these scripts directly, ``s2e-env`` will do it for + you. You will need to modify them if you wish to add support for new OSes or install additional software in the + stock guest images. + +At a high level, image build creation scripts proceed as follows: + +1. Fetch the ISO images (or have the user supply them in case of proprietary OSes). + +2. Customize the images in order to enable unattended installation. The actual mechanics depend on the OS, but + most commonly the installation scripts copy the additional software to a second ISO that will be read by the + OS installer. + +3. Run the installation. The OS will first perform its own installation, then will install any additional software. + +4. Take a ready-to-run snapshot. The S2E launch script resumes the snapshot when starting the analysis. The snapshot + is built in such a way that when it is resumed, it automatically retrieves a bootstrap script from the host. + The bootstrap script contains further commands on how to set up the analysis. This kind of setup makes it more + convenient to re-run analyses as often as needed without having to wait for the guest to boot. + +The S2E image format is identical to the RAW format, except that the image file name has the ``.s2e`` extension. +Snapshots are saved in a ``.s2e.snapshot_name`` file, alongside the base image. + +.. warning:: + + When you copy S2E images, make sure to preserve time stamps of both the base image and the snapshots + (i.e., use ``cp -p``). If the timestamps do not match, the snapshots will fail to resume. This is a protection + to avoid resuming snapshots in case the base image was modified. 
+ + +Building Linux images +===================== + +In this section, we briefly discuss how Linux images are built. + +S2E ships with custom Linux kernels, one vanilla and one customized for DARPA CGC binary analysis. The kernels are +available in this `repository `__. Basic build instructions are available in +the README. We recommend however that you study the ``guest-images`` Makefiles, which build the kernel in a Docker image +on the host, then inject the ``.deb`` files in the disk images during image creation. + +.. warning:: + + Although S2E can run any Linux kernel, we recommend you use a kernel that is compatible with the `LinuxMonitor + `__ plugin. This allows S2E plugins to track process creation/termination, + segmentation faults, etc. S2E will still work and symbolic execution will still function, but you will have to do + everything manually if ``LinuxMonitor`` is unavailable. + + +Building Windows images +======================= + +This works exactly like for Linux images, except that there is no need to re-compile the Windows kernel. +S2E ships with a custom Windows driver that exposes the kernel events to S2E plugins. This driver is available in +the `guest-tools `__ repository. + + +When should I install my software? +================================== + +If you wish to install your own software, there are two ways to go about it: + +1. Installing during image build + + In order to install software in the image, modify the scripts in the ``guest-images`` repo. They already install + various software, you can use that as an example. This method is preferred if your software is large and does not + change. + +2. Installing every time you start S2E + + Modify ``bootstrap.sh``, add ``s2eget`` to download and run your software. This method is preferred if you intend to + change your software very frequently. It adds start up overhead. + + +The S2E VM image format +======================= + +Existing image formats are not suitable for multi-path execution, because they usually mutate internal bookkeeping +structures on read operations. Worse, they may write these mutations back to the disk image file, causing VM image +corruptions. QCOW2 is one example of such formats. + +The S2E image format, unlike the other formats, is multi-path aware. When in S2E mode, writes are local to each state +and do not clobber other states. Moreover, writes are NEVER written to the image (or the snapshot). This makes it +possible to share one disk image and snapshots among many instances of S2E. + +The S2E image format is identical to the RAW format, except that the image file name has the ``.s2e`` extension. +Therefore, to convert from RAW to S2E, renaming the file is enough (a symlink is fine too). + +The S2E image format stores snapshots in a separate file, suffixed by the name of the snapshot. For example, if the +base image is called "my_image.raw.s2e", the snapshot ``ready`` (as in ``savevm ready``) will be saved in the file +``my_image.raw.s2e.ready`` in the same folder as ``my_image.raw.s2e``. + + +General guidelines for VM images +================================ + +When running in S2E mode, the image **must** be in S2E format. S2E does not support any other image format. + +Any x86 image that boots in vanilla QEMU will work in S2E. However, we enumerate a list of tips and optimizations that +will make administration easier and symbolic execution faster. These guidelines are also followed by the guest image +installation scripts. + +* Keep fresh copies of your disk images. 
It is recommended to start with a fresh copy for each analysis task. For + instance, if you use an image to test a device driver, avoid using this same image to analyze some spreadsheet + component. One image = one analysis. It is easier to manage and your results will be easier to reproduce. + +* It is recommended to use as little RAM as possible for the guest OS. S2E is not limited by the amount of memory in any + way (it is 64-bit), but your physical machine is. Larger guest memory will also add additional management overhead and + result in longer snapshot resume times. 128-256MB is a good setting for basic Linux images. Windows requires at + least 2GB. The amount of memory for an image can be set in the ``guest-images`` scripts. + +* Disable fancy desktop themes. Most OSes have a GUI, which consumes resources. Disabling all visual effects will make + program analysis faster. + +* Disable the screen saver. + +* Disable swap. It is important that the program data is not swapped out during symbolic execution, as this will + force the concretization of all symbolic data. + +* Disable unnecessary services to save memory and speed up the guest. Services like file sharing, printing, wireless + network configuration, or firewall are useless unless you want to test them in S2E. diff --git a/docs/src/MovingFiles.rst b/docs/src/MovingFiles.rst new file mode 100644 index 00000000..49048b75 --- /dev/null +++ b/docs/src/MovingFiles.rst @@ -0,0 +1,87 @@ +============================================ +Copying files between the host and the guest +============================================ + +The ``s2eget`` and ``s2eput`` tools allow transfering files between the guest virtual machine and the host during +symbolic execution even when there is no network available. If you use ``s2e-env`` and stock VM images, these two +utilities are already included in the guest VM. + +``s2eget`` +---------- + +In order to copy a file from the host to the guest VM, add the following command to the desired location in +your ``bootstrap.sh``: + +.. code-block:: console + + ./s2eget + +```` specifies the file to copy from the host into the guest. Note that the filename argument to ``s2eget`` +must be specified relative to the ``HostFiles`` plugin base directory (see below). + +If ```` is an executable file that you want to run, you may use the following command: + +.. code-block:: console + + ./s2eget && chmod +x ./ && ./ + +.. note:: + + To see a real example of how to use this command, have a look at the ``bootstrap.sh`` file generated by ``s2e-env``. + This script already contains several invocations of ``s2eget``, which copy the binary to analyze from the host to + the guest VM, together with several other support files. + + +``s2eput`` +---------- + +To copy a file ``fileName`` from the guest to the host, run the following command: + +.. code-block:: console + + ./s2eput fileName + +This uploads ``fileName`` in the guest to the ``s2e-last/outfiles`` directory on the host. This is useful, e.g., if a +program that you are analyzing generates a core dump and you want to inspect it on your host. + +.. note:: + + The upload will fail if a file with the same name already exists in the outfiles directory. + + +Setting up the HostFiles Plugin +------------------------------- + +``s2eget`` and ``s2eput`` require configuring the ``HostFiles`` plugin in the S2E configuration file +(``s2e-config.lua``). The configuration looks as follows: + +.. code-block:: lua + + plugins = { + ... 
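+        -- The "..." above stands for the plugins that are already enabled in your project
+        -- (for example by s2e-env); keep them and simply append HostFiles to the list.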
+ + "HostFiles", + } + + pluginsConfig.HostFiles = { + baseDirs = { + "/path/to/host/dir1", + "/path/to/host/dir2", + }, + + -- This option must be enabled for s2eput to work + allowWrite = true, + } + +.. note:: + + If you use s2e-env, the basic configuration is already there. You do not have to modify it unless you need + to access files that are stored outside of your project folder. Do not change the existing directory configuration + (in particular guestfs folder paths). + + +The ``pluginsConfig.HostFiles.baseDirs`` configuration option specifies what directories on the host should be shared +with the guest. The paths can be either absolute, relative, or empty. If an empty directory is specified the S2E output +directory will be exported. + +The ``pluginsConfig.HostFiles.allowWrite`` must be set to ``true`` for allowing writes to the base directories. diff --git a/docs/src/Plugins/BaseInstructions.rst b/docs/src/Plugins/BaseInstructions.rst new file mode 100644 index 00000000..50a2a9c1 --- /dev/null +++ b/docs/src/Plugins/BaseInstructions.rst @@ -0,0 +1,42 @@ +================ +BaseInstructions +================ + +S2E provides a means for guest code code to communicate with plugins by providing a special machine instruction. +S2E extends the x86 instruction set with a custom instruction. When guest code executes this instruction, S2E +invokes plugins that listen for that instruction (using the ``CorePlugin::onCustomInstruction`` event). +The instruction has the following format: + +:: + + # S2E custom instruction format + 0f 3f XX XX YY YY YY YY YY YY + + XX: 16-bit instruction code. Each plugin should have a unique one. + YY: 6-bytes operands. Freely defined by the instruction code. + +The ``BaseInstructions`` plugin uses the above format to implement basic functionality (e.g., creating symbolic +variables, get current state id, check if a memory location is symbolic, etc.). Refer to +``guest/common/include/s2e/s2e.h`` for a complete set of APIs. + +Plugins should not define new custom instruction codes. There are two problems with this format: +(1) one needs to manually allocate plugin-specific opcodes and (2) each plugin is forced to listen to all S2E +instruction invocations and filter out those of no interest. + +Instead, plugins should implement the ``IPluginInvoker`` interface. This interface provides a method that +the ``BaseInstructions`` plugin calls when the guest invokes the ``s2e_invoke_plugin()`` API. This API lets guest +code pass arbitrary data to specific plugins. Each plugin can define its own data format. + +.. code-block:: c + + # Definition in s2e.h + static inline int s2e_invoke_plugin(const char *pluginName, void *data, uint32_t dataSize); + + ... + + s2e_invoke_plugin("MyPlugin", &command, sizeof(command)); + + + +See the `source code `__ for more information +about custom instructions. diff --git a/docs/src/Plugins/EdgeKiller.rst b/docs/src/Plugins/EdgeKiller.rst new file mode 100644 index 00000000..4d6b4033 --- /dev/null +++ b/docs/src/Plugins/EdgeKiller.rst @@ -0,0 +1,37 @@ +========== +EdgeKiller +========== + +The ``EdgeKiller`` plugin looks for the execution of a sequence of program counters and kills all the paths where this +sequence occurs. This is useful to kill polling loops. + +Options +------- + +The configuration requires one section per module to be monitored. The section name must match the module identifier +defined in the configuration section of the `ModuleExecutionDetector `__ plugin. 
Each +section contains a list of named pairs of program counters that define the program edges. All program counters are +relative to the native load base of the module. The name of each pair is not important, but must be unique. + +Required Plugins +---------------- + +* `ModuleExecutionDetector `__ + +Configuration Sample +-------------------- + +The following example shows how to kill the polling loops in the ``pcntpci5.sys`` device driver. Each pair of addresses +represents the source and the target of a polling loop back-edge. + +.. code-block:: lua + + pluginsConfig.EdgeKiller = { + pcntpci5_sys_1 = { + l1 = {0x14040, 0x1401d}, + l2 = {0x139c2, 0x13993}, + l3 = {0x14c84, 0x14c5e}, + } + } + + diff --git a/docs/src/Plugins/FunctionMonitor.rst b/docs/src/Plugins/FunctionMonitor.rst new file mode 100644 index 00000000..09b3360e --- /dev/null +++ b/docs/src/Plugins/FunctionMonitor.rst @@ -0,0 +1,69 @@ +=============== +FunctionMonitor +=============== + +The ``FunctionMonitor`` plugin notifies other plugins of function calls and returns. + +Suppose that you are writing a plugin that needs to monitor calls to a function that starts at address ``0x400120`` in a +process called ``my.exe``. Your plugin also wants to monitor all returns from that function. + +Proceed as follows: + +1. Create a new plugin. You can use the ``Example`` plugin as a template. +2. Obtain the instance of the ``FunctionMonitor`` plugin in the ``initialize()`` method of the plugin. +3. Register a handler for the ``onCall`` signal provided by ``FunctionMonitor``. +4. In the call handler, check the current program counter to determine if this is a function that you want to + process further or not. You can also use information about callers and callees provided by the signal to make + this decision. +5. In the call handler, register a return handler, if necessary. +6. Do not forget to add ``my.exe`` to the configuration of ``ProcessExecutionDetector`` in ``s2e-config.lua``. + +The following part shows the code that implements the steps explained above. + +.. code-block:: cpp + + // 1. Write a new analysis plugin (e.g., based on the Example plugin) + void Example::initialize() { + // 2. Get an instance of the FunctionMonitor plugin + FunctionMonitor *monitor = s2e()->getPlugin(); + + // 3. Get a notification when a function is called + monitor->onCall.connect(sigc::mem_fun(*this, &Example::onCall)); + } + +To monitor the function located at ``0x400120``, write the following handlers: + +.. code-block:: cpp + + void Example::onCall(S2EExecutionState *state, const ModuleDescriptorConstPtr &source, + const ModuleDescriptorConstPtr &dest, uint64_t callerPc, uint64_t calleePc, + const FunctionMonitor::ReturnSignalPtr &returnSignal) { + // Filter out functions we don't care about + if (state->regs()->getPc() != 0x400120) { + return; + } + + // If you do not want to track returns, do not connect a return signal. + // Here, we pass the program counter to the return handler to identify the function + // from which execution returns. 
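+        // sigc::bind appends 0x400120 as an extra argument, which onRet receives
+        // as its last parameter (functionPc below).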
+        returnSignal->connect(
+            sigc::bind(sigc::mem_fun(*this, &Example::onRet), 0x400120));
+    }
+
+    void Example::onRet(S2EExecutionState *state, const ModuleDescriptorConstPtr &source,
+                        const ModuleDescriptorConstPtr &dest, uint64_t returnSite,
+                        uint64_t functionPc) {
+        getDebugStream(state) << "Execution returned from function " << hexval(functionPc) << "\n";
+    }
+
+
+Call/return handlers are paired: whenever the return instruction is executed
+and the stack pointer corresponds to the one at the call instruction, the return handler tied to that call is executed.
+
+You can pass as many parameters as you want to your call or return handlers. For this, you can use the ``fsigc++``
+``bind`` feature.
+
+.. note::
+
+    You can also instrument functions from Lua code using the ``LuaFunctionInstrumentation``
+    `plugin <../Howtos/LuaInstrumentation.rst>`__.
diff --git a/docs/src/Plugins/Linux/FunctionModels.rst b/docs/src/Plugins/Linux/FunctionModels.rst
new file mode 100644
index 00000000..21c49c39
--- /dev/null
+++ b/docs/src/Plugins/Linux/FunctionModels.rst
@@ -0,0 +1,65 @@
+==============
+FunctionModels
+==============
+
+The ``FunctionModels`` plugin can be used to reduce `path explosion
+`__. It works by replacing typical libc functions with
+a symbolic expression that represents that function's return value. The ``FunctionModels`` plugin currently supports
+the following libc functions:
+
+* ``strcpy``
+* ``strncpy``
+* ``strlen``
+* ``strcat``
+* ``strncat``
+* ``strcmp``
+* ``strncmp``
+* ``memcpy``
+* ``memcmp``
+* ``printf``
+* ``fprintf``
+
+For example, ``strlen`` is typically implemented as follows:
+
+.. code-block:: c
+
+    // Adapted from µlibc
+    size_t strlen(const char *s) {
+        const char *p;
+
+        for (p = s; *p; p++) ;
+
+        return p - s;
+    }
+
+If the input string ``s`` is symbolic, each iteration of the ``for`` loop will result in a state fork. This will
+quickly become intractable as the length of the input string grows. Instead of symbolically executing this function and
+forking states, the ``FunctionModels`` plugin returns a single symbolic expression that essentially "merges" these
+states and describes the string length (see the ``strlenHelper`` function `here
+`__
+to see how this is done).
+
+The astute reader will note that while this will reduce the number of forked states that S2E must explore, it will do
+so by increasing the complexity of the path constraints. This may put pressure on the constraint solver and cause it to
+take more time to solve path constraints. It is up to the user to decide if this is an acceptable trade-off.
+
+The ``FunctionModels`` plugin uses `s2e.so <../../Tutorials/BasicLinuxSymbex/s2e.so.rst>`__ to replace the function calls (e.g. ``strlen``,
+``memcpy``, etc.) with calls to the functions in ``guest/linux/function_models/models.c``. The functions in
+``models.c`` determine whether any of the function arguments are symbolic, and if so invoke the ``FunctionModels``
+plugin to generate the appropriate symbolic expression. Function call replacement relies on the target program being
+dynamically linked, so **you cannot use function models on statically-linked binaries**.
+
+To use function models, enable the ``FunctionModels`` plugin in your S2E configuration file and use ``LD_PRELOAD`` to
+load ``s2e.so`` in your ``bootstrap.sh`` script. If you are using `s2e-env <../../s2e-env.rst>`__, you will be informed
+at project creation time whether you can use the ``FunctionModels`` plugin.
Note however that by default ``s2e-env`` +will **not** automatically enable the ``FunctionModels`` plugin. + +Testing +------- + +There is a test suite suite available in ``guest/linux/function_models/models_test.c``. This test suite is compiled +along with the guest tools and placed in ``$S2EDIR/build-s2e/guest-tools{32,64}/linux/function_models``. It can be +used with the ``s2e-env new_project`` command to run in a guest virtual machine. You should see the "Good Model" +message in S2E's debug output. + +Options +------- + +This plugin does not have any options. diff --git a/docs/src/Plugins/Linux/LinuxMonitor.rst b/docs/src/Plugins/Linux/LinuxMonitor.rst new file mode 100644 index 00000000..5c5d4799 --- /dev/null +++ b/docs/src/Plugins/Linux/LinuxMonitor.rst @@ -0,0 +1,25 @@ +============ +LinuxMonitor +============ + +The ``LinuxMonitor`` plugin intercepts process creation/termination, segmentation faults, module load/unload and traps +in S2E. This is achieved by using a specially-modified version of the `Linux kernel +`__ and a dynamically-loaded kernel module. Upon particular events occurring +(e.g. process creation), the kernel will execute a custom instruction that is interpreted and handled by the +``LinuxMonitor`` plugin. ``LinuxMonitor`` exports these events for other plugins to intercept and process as required. + +Options +------- + +terminateOnSegfault=[true|false] + Set to ``true`` to terminate the currently-executing state when a segmentation fault occurs. + +terminateOnTrap=[true|false] + Set to ``true`` to terminate the currently-executing state when a trap occurs (e.g. divide-by-zero, invalid opcode, + etc.). If you are using a debugger inside the guest VM then you should set this option to ``false`` because it will + also intercept breakpoints. + +Required Plugins +---------------- + +None diff --git a/docs/src/Plugins/ModuleExecutionDetector.rst b/docs/src/Plugins/ModuleExecutionDetector.rst new file mode 100644 index 00000000..88204611 --- /dev/null +++ b/docs/src/Plugins/ModuleExecutionDetector.rst @@ -0,0 +1,25 @@ +======================= +ModuleExecutionDetector +======================= + +The ``ModuleExecutionDetector`` plugin signals to other plugins when execution enters or leaves a module of interest. +It relies on an OS monitor to get the location of the modules in memory. + + +Configuration Sample +-------------------- + +The configuration sample below will make ``ModuleExecutionDetector``: + +- notify other plugins when execution enters or leaves ``myprogram`` (if ``trackExecution`` is set to ``true``) +- notify other plugins when the DBT translates code that belongs to ``myprogram`` + +.. code-block:: lua + + pluginsConfig.ModuleExecutionDetector = { + myprog_id = { + moduleName = "myprogram", + }, + + trackExecution=true + } diff --git a/docs/src/Plugins/RawMonitor.rst b/docs/src/Plugins/RawMonitor.rst new file mode 100644 index 00000000..910029ed --- /dev/null +++ b/docs/src/Plugins/RawMonitor.rst @@ -0,0 +1,79 @@ +========== +RawMonitor +========== + +The ``RawMonitor`` plugin lets users specify via a custom instruction whenever a module of interest is loaded or +unloaded. It is useful when using S2E on a new OS for which there is no plugin that automatically extracts this +information. ``RawMonitor`` can also be used to analyze raw pieces of code, such as the BIOS, firmware, etc. + +Custom Instruction +------------------ + +``RawMonitor`` defines the following custom instruction (in ``s2e/monitors/raw.h``): + +.. 
code-block:: c + + void s2e_raw_load_module(const struct S2E_RAWMON_COMMAND_MODULE_LOAD *module); + +It takes as parameter a pointer to a structure that describes the loaded module. Use this function in your code to +manually specify module boundaries, for example: +: + +.. code-block:: c + + int main() { + struct S2E_RAWMON_COMMAND_MODULE_LOAD m; + m.name = (uintptr_t) "myprog"; + m.path = (uintptr_t) "/home/user/myprog"; + m.pid = getpid(); + m.load_base = ... ; /* the address where myprog is loaded */ + m.size = ... ; /* size of myprog */ + m.entry_point = 0; + m.native_base = 0; + m.kernel_mode = 0; + + s2e_raw_load_module(&m); + ... + } + +Options +------- + +The preferred way of using RawMonitor is through the ``s2e_raw_monitor*`` custom instructions, without specifying any +module descriptor in the configuration. The ``s2e.so`` shared library uses this mechanism to provide basic Linux +monitoring capabilities. + +RawMonitor also accepts global options and an arbitrary number of per-module sections. Per-module options are prefixed +with "module." in the documentation. This can be useful to monitor modules loaded at known fixed addresses (e.g., +kernel, BIOS, etc.). + +kernel_start=[address] + Indicates the boundary between the memory mapped in all address spaces and process-specific memory. On Linux, this + value is typically 0xC0000000, while one Windows it is 0x80000000. Set the value to zero if this distinction does + not make sense (e.g., there are no address spaces). + +module.name=["string"] + The name of the module. This must match the name passed to ``s2e_raw_load_module``. + +module.start=[address] + The run-time address of the module. Set to zero if the runtime address is determined by the custom instruction. + +module.size=[integer] + The size of the module binary. + +module.native_base=[address] + The default base address of the binary set by the linker. + +module.kernel=[true|false] + Whether the module lies above or below the kernel-mode threshold. Assumes that the module is mapped in all address + space at the same location above the kernel/user-space boundary. + +Configuration Sample +-------------------- + +.. code-block:: lua + + -- The custom instruction will notify RawMonitor of all newly loaded modules + pluginsConfig.RawMonitor = { + kernelStart = 0xc0000000, + } diff --git a/docs/src/Plugins/Tracers/ExecutionTracer.rst b/docs/src/Plugins/Tracers/ExecutionTracer.rst new file mode 100644 index 00000000..4da3f9ae --- /dev/null +++ b/docs/src/Plugins/Tracers/ExecutionTracer.rst @@ -0,0 +1,335 @@ +================= +Execution Tracers +================= + +S2E provides plugins to record execution traces. The main plugin is ``ExecutionTracer``. All other tracing plugins, such +as ``MemoryTracer`` or ``TranslationBlockTracer``, use ``ExecutionTracer`` in order to append various trace entries to +the trace file (e.g., a memory access or a translation block execution). ``ExecutionTracer`` gathers trace data from the +various plugins and outputs them to an ``ExecutionTracer.dat`` file in ``protobuf`` format. + + +.. note:: + + This document is a reference. For a more hands-on explanation of how to generate traces, please + read this `tutorial <../../Howtos/ExecutionTracers.rst>`__ first. + +ExecutionTracer +=============== + +An execution trace is a sequence of ``(magic, header_size, header, item_size, item)`` entries. +The magic number identifies the start of a trace entry. 
It is followed by the size of the header, the header, the +item size, then finally the actual item. + +The header is defined by the ``PbTraceItemHeader`` protobuf type. For more details, see the +`TraceEntries.proto `__ +file. + +.. code-block:: c + + message PbTraceItemHeader { + required uint32 state_id = 1; + required uint64 timestamp = 2; + required uint64 address_space = 3; + required uint64 pid = 4; + required uint64 pc = 5; + required PbTraceItemHeaderType type = 6; + } + +The header entry contains data that is common to all trace items. This includes the program id, the program counter, +and the address space (i.e., ``cr3`` register on x86) at the time a trace entry is created. The type field specifies +the type of the item that follows the header. The header also stores the state identifier to help reconstruct the +execution tree offline. + +The execution tracer ensures that it is possible to reconstruct the execution tree and extract individual paths when +S2E terminates. The tree can be decoded even when the trace recording is partial (e.g., in case S2E is forcefully +terminated). In order to do this, the execution tracer plugin monitors forks and records the id of the forked states in +the trace (see ``PbTraceItemFork``). Any subsequent trace item has the corresponding state id in its header. If two +items in the trace have the same state identifier, the item that happened earlier in the path is the one that comes +first in the trace. + + +ModuleTracer +============ + +This plugin traces process and module events. The following snippet is an example of a module load event. This indicates +that the OS loaded the shared library ``/lib64/ld-linux-x86-64.so.2`` into a process whose program identifier is 1245. +The event also records the location of each loaded section of the binary. The ``runtime_load_base`` of a section +is its start address in virtual memory as decided by the OS. The ``native_load_base`` is the address of the section +as specified by the linker. The two are often different because of ASLR or relocations. + +.. code-block:: json + + { + "address_space": 255299584, + "name": "ld-linux-x86-64.so.2", + "path": "/lib64/ld-linux-x86-64.so.2", + "pc": 18446744071580784396, + "pid": 1245, + "sections": [ + { + "executable": true, + "name": "", + "native_load_base": 0, + "readable": true, + "runtime_load_base": 140472821178368, + "size": 142304, + "writable": false + }, + { + "executable": false, + "name": "", + "native_load_base": 2243520, + "readable": true, + "runtime_load_base": 140472823421888, + "size": 5120, + "writable": true + } + ], + "state_id": 0, + "timestamp": 630430186029900, + "type": "TRACE_MOD_LOAD" + } + +``ModuleTracer`` is a fundamental plugin, as it helps resolve the raw program counters in the trace to actual +binary names. ``s2e-env`` uses this extensively to get file and line numbers when binaries have debug information. + + +TestCaseGenerator +================= + +The ``TestCaseGenerator`` has several functions: + +- Write test cases to ``debug.txt`` +- Store test case files in ``s2e-last`` +- Write test cases to ``ExecutionTracer.dat``. + +In this document, we will examine the latter. A test case entry has the following format: + +.. code-block:: json + + { + "address_space": 225800192, + "items": [ + { + "key": "v0___symfile____tmp_input___0_1_symfile___0", + "value": "AQEBAQEBAQEBAQEBAQE...." 
+ } + ], + "pc": 134518939, + "pid": 1295, + "state_id": 1, + "timestamp": 630185690261719, + "type": "TRACE_TESTCASE" + } + +The most important field is ``items``: it records a list of (key, value) pairs. The key is the name of the symbolic +variable and the value is the concrete assignment to that variable. The variable name has the following format: + +.. code-block:: c + + vXXX_variablename_YYY + +``XXX`` is the relative order of the variable within the execution path. When a new variable is created, this +number is incremented by one. So the 10th variable of a state will have an id of 9. + +``variablename`` is the string passed to ``s2e_make_symbolic``. + +``YYY`` is the absolute order of the variable within a given S2E run. This number can be non-deterministic and +influenced by the sequence of state switches. + +These two identifiers ensure that each variable is globally unique and can be easily ordered when generating a test +case. + + +MemoryTracer +============ + +This plugin traces memory accesses, page faults, and TLB misses in the specified processes. It requires the following +configuration in ``s2e-config.lua``: + +.. code-block:: lua + + add_plugin("MemoryTracer") + + pluginsConfig.MemoryTracer = { + -- You can selectively enable/disable tracing various events + traceMemory = true, + tracePageFaults = true, + traceTlbMisses = true, + + -- This list specifies the modules to trace. + -- If this list is empty, MemoryTracer will trace all processes specified in ProcessExecutionDetector. + -- Modules specified here must run in the context of the process(es) defined in ProcessExecutionDetector. + moduleNames = { "test" } + } + +.. note:: + + ``MemoryTracer`` may produce large amounts of data (on the order of gigabytes), so make sure to restrict tracing + to the modules of interest. + +Here is an example of a memory access: + +.. code-block:: json + + { + "address": 140720832808700, + "address_space": 255279104, + "concrete_buffer": 0, + "flags": 1, + "host_address": 0, + "pc": 94739530127322, + "pid": 1251, + "size": 4, + "state_id": 0, + "timestamp": 630430187009925, + "type": "TRACE_MEMORY", + "value": 3735928559 + } + +- ``address``: the virtual address of the memory access +- ``value``: the concrete value written/read by the memory access +- ``size``: the size in bytes of the memory access +- ``flags``: this is a bitmask that indicates the type of the access + + - bit 0: set to 1 if the access is a write, read otherwise + - bit 1: set to 1 if the access is memory-mapped I/O. Note that due to how CPU emulation works, + normal RAM accesses may sometimes appear as MMIO. + - bit 2: indicates that the value is symbolic. In this case , the trace entry stores the concrete version + of the data. + - bit 3: the address is symbolic. In this case, the trace entry stores the concrete version of the symbolic address. + + Several other bits are available, please check ``TraceEntries.proto`` for more details. + +Several other fields are stored when ``traceHostAddresses`` is set in the configuration. These are useful under +rare circumstances when debugging the execution engine, e.g., to make sure that memory accesses get translated to +the right memory location on the host machine. + +- ``host_address``: this is the address of the access as mapped by QEMU when initializing KVM memory regions. +- ``concrete_buffer``: this is the final address of the access, after S2E translated the host address to the actual + per-state location. + + +A TLB miss looks as follows. 
It only stores the address and whether the access was a read or a write.
+
+.. code-block:: json
+
+    {
+        "address": 140720832808704,
+        "address_space": 255279104,
+        "is_write": false,
+        "pc": 94739530127511,
+        "pid": 1251,
+        "state_id": 1,
+        "timestamp": 630430187323794,
+        "type": "TRACE_TLBMISS"
+    }
+
+
+TranslationBlockTracer
+======================
+
+This plugin records the state of the CPU registers before and/or after a translation block is executed.
+A translation block is a sequence of guest instructions that ends in a control flow change.
+
+.. code-block:: lua
+
+    add_plugin("TranslationBlockTracer")
+
+    pluginsConfig.TranslationBlockTracer = {
+        -- In general, the CPU state at the beginning of a translation block is equal
+        -- to the state at the end of the previous translation block.
+        -- In cases where a translation block is interrupted because of an exception, the end state
+        -- may not be recorded.
+        traceTbStart = true,
+        traceTbEnd = false,
+        moduleNames = {"test"}
+    }
+
+Here is an example of a trace entry for translation blocks for 64-bit x86 code:
+
+.. code-block:: json
+
+    {
+        "address_space": 255324160,
+        "data": {
+            "first_pc": 93907771242192,
+            "last_pc": 93907771242228,
+            "size": 42,
+            "tb_type": "TB_CALL_IND"
+        },
+        "pc": 93907771242192,
+        "pid": 1253,
+        "regs": {
+            "values": [
+                28,
+                140723069759552,
+                139806606502816,
+                0,
+                140723069759520,
+                0,
+                2,
+                139806608687472,
+                139806608688896,
+                0,
+                8,
+                140723070026140,
+                93907771242192,
+                140723069759520,
+                0,
+                0
+            ],
+            "symb_mask": 0
+        },
+        "state_id": 0,
+        "timestamp": 631227908208916,
+        "type": "TRACE_TB_START"
+    }
+
+There are several fields specific to this type of trace entry:
+
+- ``tb_type``: the type of the translation block. It is defined by the last instruction of the block
+  (e.g., direct call/jump, indirect call/jump, system call, etc.).
+- ``regs``: the register data:
+
+  - ``values``: concrete content of the CPU registers.
+  - ``symb_mask``: this is a bitmask that indicates which register contains symbolic data. In case a register is
+    symbolic, the ``values`` field contains a concrete value that satisfies path constraints at the time of
+    the recording. Note that symbolic data is not recorded by the plugin.
+
+- ``size``: the size in bytes of the guest instructions contained in this translation block.
+- ``first_pc``: the program counter of the first instruction in the translation block.
+- ``last_pc``: the program counter of the last instruction in the translation block.
+
+
+InstructionCounter
+==================
+
+This plugin counts how many instructions have been executed in the configured processes or modules and writes
+the count to the execution trace. The plugin keeps a per-path count and writes it when the path terminates.
+
+.. code-block:: lua
+
+    add_plugin("InstructionCounter")
+    pluginsConfig.InstructionCounter = {
+        -- This list specifies the modules to trace.
+        -- If this list is empty, InstructionCounter will count instructions in all processes specified in ProcessExecutionDetector.
+        -- Modules specified here must run in the context of the process(es) defined in ProcessExecutionDetector.
+        moduleNames = {"test"}
+    }
+
+
+Here is a sample trace entry:
+
+
+.. 
code-block:: json + + { + "address_space": 232222720, + "count": 191951636, + "pc": 134518939, + "pid": 1255, + "state_id": 0, + "timestamp": 631227908821908, + "type": "TRACE_ICOUNT" + } diff --git a/docs/src/Plugins/Windows/WindowsMonitor.rst b/docs/src/Plugins/Windows/WindowsMonitor.rst new file mode 100644 index 00000000..88ceba82 --- /dev/null +++ b/docs/src/Plugins/Windows/WindowsMonitor.rst @@ -0,0 +1,10 @@ +============== +WindowsMonitor +============== + +The ``WindowsMonitor`` plugin implements the detection of module and process loads/unloads on the Windows operating +system. It can be referred to as ``OSMonitor`` by other plugins. The plugin catches the invocation of specific kernel +functions to detect these events. + +Options +------- diff --git a/docs/src/Profiling/ProfilingS2E.rst b/docs/src/Profiling/ProfilingS2E.rst new file mode 100644 index 00000000..bd5b35b3 --- /dev/null +++ b/docs/src/Profiling/ProfilingS2E.rst @@ -0,0 +1,89 @@ +============= +Profiling S2E +============= + +This page explains how to profile memory and CPU performance. + + +Profiling memory usage +====================== + +We are going to use `heaptrack `_. Its main advantage over competing profilers +is that it is possible to inject it in a running S2E process. Others require using ``LD_PRELOAD``, which interferes +with ``libs2e.so`` and causes crashes. The disadvantage is that you cannot inject it before S2E starts. + +1. Build ``heaptrack`` from source. Avoid using the one that ships with Ubuntu 18.04, it may not be as reliable. + +2. Start S2E + +3. Inject ``heaptrack`` into the running S2E instance: + + .. code-block:: bash + + $ ps aux | grep qemu + ubuntu 31044 96.7 4.7 3487216 1191460 pts/8 Sl 13:54 0:03 /home/ubuntu/s2e/env/install/bin/qemu-system-i386 -drive file=/home/ubuntu/s2e/env/images/debian-9.2.1-i386/image.raw.s2e,format=s2e,cache=writeback -k en-us -nographic -monitor null -m 256M -enable-kvm -serial file:serial.txt -net none -net nic,model=e1000 -loadvm ready + + $ heaptrack -p 31044 + heaptrack output will be written to "/home/ubuntu/heaptrack/build/heaptrack.qemu-system-i38.31044.gz" + injecting heaptrack into application via GDB, this might take some time... + injection finished + ... + +4. Wait for S2E to terminate or kill it manually. You will see the following output: + + .. code-block:: bash + + heaptrack stats: + allocations: 6836389 + leaked allocations: 46722 + temporary allocations: 1249045 + removing heaptrack injection via GDB, this might take some time... + ptrace: No such process. + No symbol table is loaded. Use the "file" command. + The program is not being run. + Heaptrack finished! Now run the following to investigate the data: + + heaptrack --analyze "/home/ubuntu/heaptrack/build/heaptrack.qemu-system-i38.31044.gz" + + +5. Open the resulting file in the ``heaptrack`` GUI + + .. code-block:: bash + + $ heaptrack_gui "/home/ubuntu/heaptrack/build/heaptrack.qemu-system-i38.31044.gz" + + +.. image:: ./heaptrack.png + + +Profiling CPU performance +========================= + +We are going to use `hotspot `_. It is a convenient GUI wrapper around the Linux +`perf` tool. + +1. Build ``hotspot`` from source + +2. Start ``hotspot`` + + .. code-block:: bash + + $ hotspot + + +.. image:: ./hotspot1.png + + +3. Start S2E + +4. Start profiling + + - Click on ``Record Data`` + - Select ``Attach To Process(es)`` + - Select the S2E instance you want to profile + - Click ``Start Recording`` + +5. Stop recording and view results + +.. 
image:: ./hotspot2.png + diff --git a/docs/src/Profiling/heaptrack.png b/docs/src/Profiling/heaptrack.png new file mode 100644 index 00000000..5c594833 Binary files /dev/null and b/docs/src/Profiling/heaptrack.png differ diff --git a/docs/src/Profiling/hotspot1.png b/docs/src/Profiling/hotspot1.png new file mode 100644 index 00000000..cb147f72 Binary files /dev/null and b/docs/src/Profiling/hotspot1.png differ diff --git a/docs/src/Profiling/hotspot2.png b/docs/src/Profiling/hotspot2.png new file mode 100644 index 00000000..d1984460 Binary files /dev/null and b/docs/src/Profiling/hotspot2.png differ diff --git a/docs/src/StateMerging.rst b/docs/src/StateMerging.rst new file mode 100644 index 00000000..8f81526e --- /dev/null +++ b/docs/src/StateMerging.rst @@ -0,0 +1,137 @@ +=============================================== +Exponential Analysis Speedup with State Merging +=============================================== + +Symbolic execution can produce an exponential number of paths, considerably slowing down analysis. When S2E encounters +a branch that depends on a symbolic condition and both outcomes are possible, S2E forks the current execution path in +two. This process can repeat recursively, resulting in an exponential number of paths. + +The following piece of code demonstrates the problem. It is a simplification of the ``ShiftInBits()`` function of the +Intel e100bex NIC driver from the Windows WDK. It consists of a loop that reads a value from a hardware register +bit-by-bit. + +.. code-block:: c + + uint16_t ShiftInBits() { + uint16_t value = 0; + + for (int i = 0; i < sizeof(value) * 8; ++i) { + value <<= 1; + + if (read_register()) { + value |= 1; + } + } + + return value; + } + +On each iteration, ``read_register()`` returns a fresh symbolic value, causing a fork at the conditional statement. +Since there are 16 iterations in total, this amounts to 65,536 execution states. + +If we look closely, every forked path in the function above differs only by one bit, set to zero or one depending on +the register value. If S2E could merge both paths back together while remembering that small difference, there would +remain only one path at the end of the function, reducing by four orders of magnitude the number of paths to explore. + +Using State Merging in S2E +========================== + +To use state merging in S2E, first enable the ``MergingSearcher`` plugin. + +.. code-block:: lua + + -- File: config.lua + s2e = { + kleeArgs = { + -- needed to avoid merge failures due to different shared-concrete objects: + "--state-shared-memory=true" + } + } + + plugins = { + "BaseInstructions", + "MergingSearcher" + } + +Then, compile the following program, then run it in S2E: + +.. code-block:: c + + #include + + uint16_t ShiftInBits() { + uint16_t value = 0; + int i; + + for (i = 0; i < sizeof(value) * 8; ++i) { + value <<= 1; + + /* Simulates read_register() */ + uint8_t reg = 0; + s2e_make_symbolic(®, sizeof(reg), "reg"); + + s2e_disable_all_apic_interrupts(); + s2e_merge_group_begin(); + + if (reg) { + value |= 1; + } + + s2e_merge_group_end(); + s2e_enable_all_apic_interrupts(); + } + + return value; + } + + int main(int argc, char **argv) { + uint16_t value = ShiftInBits(); + if (value == 0xabcd) { + s2e_printf("found it\n"); + } + + return 0; + } + +* How many paths do you observe? +* Comment out calls to ``s2e_merge_group_begin()`` and ``s2e_merge_group_end()``. How does this affect the number of + paths? 
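+
+If you want to experiment with the example, a compile line similar to the one used in the Linux tutorials should work.
+This is a sketch: the source file name ``merging.c`` is a placeholder, and the resulting binary must then be run inside
+the guest via your project's ``bootstrap.sh``:
+
+.. code-block:: bash
+
+    # s2e.h (which declares the s2e_* helpers used above) lives in the guest include directory
+    gcc -I$S2ESRC/guest/common/include -O3 merging.c -o merging
+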
+ +State Merging API +================= + +The S2E state merging API offers two calls: ``s2e_merge_group_begin()`` and ``s2e_merge_group_end()``. + +The subtree that begins at ``s2e_merge_group_begin()`` and whose leaves end at ``s2e_merge_group_end()`` is merged into +one path. The ``MergingSearcher`` behaves as follows: + +The searcher suspends the first path (path A) that reaches ``s2e_merge_group_begin()``. + +* If path A did not fork any other path between ``s2e_merge_group_begin()`` and ``s2e_merge_group_end()``, there is + nothing to merge, and the searcher resumes path A normally. + +* If path A forked other paths (e.g., B and C), the searcher schedules another path. The scheduled path could be B, C, + or any other path outside the subtree to be merged. + +* When path B reaches ``s2e_merge_group_end()``, ``MergingSearcher`` merges it with A, then kills B. + +* When path C reaches ``s2e_merge_group_end()``, ``MergingSearcher`` merges it with A+B, then kills C. + +Limitations +=========== + +* It is not possible to nest pairs of ``s2e_merge_group_begin()`` and ``s2e_merge_group_end()``. + +* S2E must be running in concrete mode when merging states (``s2e_merge_group_end()`` ensures that it is the case). + +* The set of symbolic memory objects must be identical in all states that are going to be merged. For example, there + shouldn't be calls to ``s2e_make_symbolic`` between ``s2e_merge_group_begin()`` and ``s2e_merge_group_end()``. + +* It is not possible to merge two states if their concrete CPU state differs (e.g., floating point or MMX registers, + program counter, etc.). + +* ``s2e_disable_all_apic_interrupts()`` and ``s2e_enable_all_apic_interrupts()`` ensure that the concrete state is not + clobbered needlessly by interrupts. The direct consequence is that the merged subtree cannot call into the + environment (no syscalls, etc.). Not disabling interrupts will make merging much harder because the side effects of + the interrupt handlers and those of the OS will have to be merged as well. If the side effects affected the concrete CPU state, + merging will fail. diff --git a/docs/src/Testsuite.rst b/docs/src/Testsuite.rst new file mode 100644 index 00000000..3672fa86 --- /dev/null +++ b/docs/src/Testsuite.rst @@ -0,0 +1,158 @@ +============= +S2E Testsuite +============= + +In this document, you will learn how to build and run the S2E testsuite. The testsuite is a collection of programs that +help test various aspects of the S2E engine. It also serves as a reference for various S2E tutorials. A test is +comprised of one or more program binaries, a script that sets up an S2E analysis project, runs it, and then checks that +the results are correct. + + +Building the testsuite +====================== + +Before proceeding, set up your S2E environment using ``s2e-env``. Once it is up and running (i.e., you can create +analysis projects and run them), you are ready to build and run the testsuite: + +.. code-block:: bash + + s2e testsuite generate + +This creates test projects named ``testsuite_*`` in the ``projects`` folder. One test may have several different +projects, each project running the test in a particular configuration. For example, a test that checks that a Windows +32-bit binary runs properly may want to do so on say three different versions of Windows, resulting in three separate +projects. + +The ``s2e testsuite generate`` command also accepts a list of test names. If one or more test names are specified, the +command generates test projects only for those tests. 
This may be useful to make it faster when you want to generate and +run only one or two tests, e.g., when you write your own test and want to debug it. + +You can view the list of available tests with the following command: + +.. code-block:: bash + + s2e testsuite list + +.. note:: + + Some tests require a Windows build server. This server must be set up as described `here `_ and + enabled in the ``s2e.yaml`` file located at the root of your S2E environment. If you do not want to build the + server, you may skip the tests that require it as follows: + + .. code-block:: bash + + s2e testsuite generate --no-windows-build + +.. warning:: + + Testsuite generation recreates test projects from scratch and overwrites previously generated tests. Avoid + modifying generated tests directly and modify the testsuite source instead (see later for details). + + +Running the testsuite +===================== + +There are two ways to run the testsuite: using ``s2e testsuite run`` or invoking the ``run-tests`` script. + +1. Using the ``s2e`` command. + + .. code-block:: bash + + s2e testsuite run + + This command automatically starts the optimal number of parallel tests depending on how many + CPU cores and memory the machine has. + + You may also specify one or more test project names, in which case the command will + run only those tests. + + This command saves the console output of all tests in ``stdout.txt`` and ``stderr.txt`` files in the test project's + directory. + +2. Manually invoking the ``run-tests`` script in the test project's folder. + This is useful when you want to integrate tests in your own testing environment (e.g., Jenkins). + The script returns a non-zero status code if the test fails. Additionally, + the script writes the ``status`` file in the ``s2e-last`` folder containing either + ``SUCCESS`` or ``FAILURE``. + + You can use GNU parallel to run multiple tests at once: + + .. code-block:: bash + + S2EDIR=$(pwd) parallel -j5 ::: projects/testsuite_*/run-tests + + +.. note:: + + A test project contains all the usual S2E configuration files and scripts. + For example, you can run the project with ``s2e run`` or ``launch-s2e.sh``. However, this will just + run S2E and will not check the output. Make sure to use ``run-tests`` or ``s2e testsuite run`` + instead. + + +Adding your own tests +===================== + +The testsuite is located in the `testsuite `__ repository. +In order to add a test, follow these steps: + +1. Create a subdirectory named after the test. + +2. Create a makefile. It must have an ``all`` target that builds the binaries. + +3. Create a ``config.yml`` file that describes the tests. See the reference section for details. + +4. Create a ``run-tests.tpl`` file that launches the test project and checks the output after S2E terminates. + This is a Jinja template that ``s2e testsuite generate`` instantiates into ``run-tests`` and places + in the project's directory. This file would typically start with the following lines: + + .. code-block:: bash + + #!/bin/bash + + {% include 'common-run.sh.tpl' %} + + s2e run -n {{ project_name }} + + + ``common-run.sh.tpl`` contains various helper functions and variables that the ``run-tests`` can use + to check the test results. + +.. note:: + + You may write ``run-tests.tpl`` in any language (e.g. Python). + Just make sure that it checks for ``S2EDIR``, sets the proper exit code on failure and creates + the ``status`` file as appropriate. 
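+
+For illustration, the sketch below shows the kind of checks a hand-written ``run-tests.tpl`` might perform. The grep
+pattern and the ``S2E_LAST`` variable are placeholders; real tests typically rely on the helpers provided by
+``common-run.sh.tpl``:
+
+.. code-block:: bash
+
+    #!/bin/bash
+
+    {% include 'common-run.sh.tpl' %}
+
+    s2e run -n {{ project_name }}
+
+    S2E_LAST="$S2EDIR/projects/{{ project_name }}/s2e-last"
+
+    # Decide whether the test passed, record the verdict, and set the exit code
+    if grep -q "expected marker" "$S2E_LAST/debug.txt"; then
+        echo "SUCCESS" > "$S2E_LAST/status"
+        exit 0
+    else
+        echo "FAILURE" > "$S2E_LAST/status"
+        exit 1
+    fi
+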
+ + +Test configuration reference +============================ + +This section describes the contents of the ``config.yml`` file. + +- **description**: a string describing the purpose of the test. It is displayed by ``s2e testsuite list`` + +- **targets**: a list of binaries to be tested. These binaries are produced by the makefile. + Each entry is a path relative to the test folder. + +- **target_arguments**: a list of parameters to give to the binary. These are the same parameters passed + to the binary in the ``s2e new_project`` command. Typically, the argument is ``@@`` to allow symbolic + input files. + +- **options**: a list of parameters to be passed to ``s2e new_project``. In general, these are usual parameters with + leading dashes stripped and others converted to underscores, e.g., ``--enable-pov-generation`` + becomes ``enable_pov_generation: true``. + +- **build-options**: a list of options that control test project generation. + + - **windows-build-server**: when set to true, indicates that the test requires a Windows build server + to create binaries. + + - **post-project-generation-script**: path to a script that is ran after ``s2e new_project`` is called. + You can use this script to customize the project configuration. + +- **target-images**: a list of images to use for the tests. When this option is missing, the test + generator creates a project for every usable image, unless it is blacklisted. + +- **blacklisted-images**: list of images for which to not create tests. + This is useful in case a binary is incompatible with a specific OS version. diff --git a/docs/src/Tools/ExecutionProfiler.rst b/docs/src/Tools/ExecutionProfiler.rst new file mode 100644 index 00000000..66c751a6 --- /dev/null +++ b/docs/src/Tools/ExecutionProfiler.rst @@ -0,0 +1,21 @@ +================== +Execution Profiler +================== + +The execution profiler tool outputs various metrics about the execution, like the number of executed instructions, +page faults, TLB misses, cache profile, etc. + +Examples +-------- + +Required Plugins +---------------- + +* ``ExecutionTracer`` +* ``CacheSim`` +* ``InstructionCounter`` + +Optional Plugins +---------------- + +* ``ModuleTracer`` (for debug information) diff --git a/docs/src/Tools/ForkProfiler.rst b/docs/src/Tools/ForkProfiler.rst new file mode 100644 index 00000000..7f8b1303 --- /dev/null +++ b/docs/src/Tools/ForkProfiler.rst @@ -0,0 +1,42 @@ +=============================================== +Debugging path explosion with the fork profiler +=============================================== + +The fork profile gives you a summary of all the locations in the system that forked. +This helps debug sources of path explosion. + +In order to obtain a fork profile, run the `forkprofile` subcommand as follows: + +.. code-block:: console + + $ s2e new_project /home/users/coreutils-8.26/build/bin/cat -T @@ + $ ... launch s2e and let it run for a while ... 
+ + $ s2e forkprofile cat + INFO: [symbols] Looking for debug information for /home/vitaly/s2e/env/projects/cat-symb/././cat + INFO: [forkprofile] # The fork profile shows all the program counters where execution forked: + INFO: [forkprofile] # process_pid module_path:address fork_count source_file:line_number (function_name) + INFO: [forkprofile] 01309 cat:0x08049ade 143 /home/user/coreutils-8.26/src/cat.c:483 (cat) + INFO: [forkprofile] 01309 cat:0x08049b0a 81 /home/user/coreutils-8.26/src/cat.c:488 (cat) + INFO: [forkprofile] 01309 cat:0x08049981 73 /home/user/coreutils-8.26/src/cat.c:410 (cat) + + +The trace above shows that the `cat` module (which has a pid 1309) forked at three different program counters. +The source information (if available) gives the source file, the line, and the function name where the forks occurred. + +Using the fork profile to debug path explosion +============================================== + +1. Spot locations that should not fork. + For example, your program might write something to the console. This typically results in forks in the + OS's console driver. The source information will point to the responsible linux kernel driver, allowing + you to quickly fix your bootstrap script (e.g., by redirecting output to `/dev/null` or a symbolic file). + +2. Identify library functions that can be optimized. + You will immediately see if the program forks in functions such as `strlen` or `atoi`. These functions can + be replaced with `function models <../Plugins/Linux/FunctionModels.rst>`__ that eliminate forks altogether, although + at the expense of generating more complex expressions. + +3. Identify sections of code that can benefit of `state merging <../StateMerging.rst>`__. + Certain types of code sections are hard to model and can be instead surrounded by `s2e_merge_group_begin()` and + `s2e_merge_group_end()` API calls, which will merge into one state the subtree in between these two calls. diff --git a/docs/src/Tools/TbPrinter.rst b/docs/src/Tools/TbPrinter.rst new file mode 100644 index 00000000..afa028c0 --- /dev/null +++ b/docs/src/Tools/TbPrinter.rst @@ -0,0 +1,41 @@ +============= +Trace Printer +============= + +The trace printer tool outputs for each specified path all the trace items that were collected in these states. Items +include memory accesses, executed translation blocks, or test cases. This tool is useful to quickly observe the +execution sequence that led to a particular event that caused a state to terminate. It can also be useful for +debugging. + +Examples +-------- + +A complete tutorial on how to generate and display a trace can be found `here <../Howtos/ExecutionTracers.rst>`__. + +Assuming you have obtained a trace in ``ExecutionTracer.dat``, the following command outputs the translation block +trace for paths 0 and 34. Omitting the ``-pathId`` option will cause the command to output the trace for all paths. + +If the ``-printRegisters`` option is specified, the command also prints the contents of the registers before and after +the execution of each translation block (provided that ``TranslationBlockTracer`` was enabled). + +``-printMemory`` also shows all memory accesses (provided that ``MemoryTracer`` was enabled). + +.. 
code-block:: console + + $S2EDIR/build-s2e/tools/Release+Asserts/bin/tbtrace -trace=s2e-last/ExecutionTracer.dat \ + -outputdir=s2e-last/traces -pathId=0 -pathId=34 -printMemory + +Required Plugins +---------------- + +* ``ModuleExecutionDetector`` (only the translation blocks of those modules that are configured will be traced) +* ``ExecutionTracer`` +* ``TranslationBlockTracer`` +* ``ModuleTracer`` (for module information) + +Optional Plugins +---------------- + +* ``TestCaseGenerator`` (for test cases) +* ``MemoryTracer`` (for memory traces) +* ``TranslationBlockTracer`` (for executed translation blocks) diff --git a/docs/src/Tutorials/BasicLinuxSymbex/SourceCode.rst b/docs/src/Tutorials/BasicLinuxSymbex/SourceCode.rst new file mode 100644 index 00000000..b8bee0e3 --- /dev/null +++ b/docs/src/Tutorials/BasicLinuxSymbex/SourceCode.rst @@ -0,0 +1,221 @@ +========================================= +Instrumenting Program Source Code for S2E +========================================= + +`s2e-env <../../s2e-env.rst>`__ and `s2e.so `__ automate most of the tedious tasks of setting up virtual +machines, S2E configuration files, launch scripts, etc. This tutorial assumes that you have already built S2E and +prepared a VM image as described on the `Building the S2E Platform <../../BuildingS2E.rst>`__ page, that you are +familiar with `s2e-env <../../s2e-env.rst>`__, and that you tried symbolic execution on simple `Linux binaries +`__. + +In this tutorial, you will learn in more details what happens under the hood of `s2e-env <../../s2e-env.rst>`__ and +`s2e.so `__, in particular how they actually make input symbolic, and how you can do the same for arbitrary +programs. This may be useful on targets and platforms that are not supported yet by S2E's project creation scripts. We +advise you to study how these tools work and adapt them to your own needs. Don't hesitate to look at their source code. + +.. contents:: + +Introduction +============ + +We want to cover all of the code of the following program by exploring all the possible paths through it. + +.. code-block:: c + + #include + #include + + int main(void) { + char str[3]; + + printf("Enter two characters: "); + + if (!fgets(str, sizeof(str), stdin)) { + return 1; + } + + if (str[0] == '\n' || str[1] == '\n') { + printf("Not enough characters\n"); + } else { + if (str[0] >= 'a' && str[0] <= 'z') { + printf("First char is lowercase\n"); + } else { + printf("First char is not lowercase\n"); + } + + if (str[0] >= '0' && str[0] <= '9') { + printf("First char is a digit\n"); + } else { + printf("First char is not a digit\n"); + } + + if (str[0] == str[1]) { + printf("First and second chars are the same\n"); + } else { + printf("First and second chars are not the same\n"); + } + } + + return 0; + } + + +Compiling and running +===================== + +First, compile and run the program as usual, to make sure that it works. + +.. code-block:: bash + + $ gcc -m32 -O3 tutorial1.c -o tutorial1 + $ ./tutorial1 + Enter two characters: ab + First char is lowercase + First char is not a digit + First and second chars are not the same + +Then, create a new project using `s2e-env <../../s2e-env.rst>`__ . Once it is created, run it, and check that it runs +properly in the guest. You will need to run S2E with graphics output enabled so that you can type the input. + +.. 
code-block:: bash
+
+    s2e new_project /path/to/tutorial1
+
+
+Preparing the program for symbolic execution
+============================================
+
+In order to execute the program symbolically, it is necessary to specify what values should become symbolic. There are
+many ways to do it in S2E, but the simplest one is to use the S2E opcodes library. This library provides a way for guest
+code to communicate with the S2E system.
+
+In order to explore all possible paths through the program that correspond to all possible inputs, we want to make these
+inputs symbolic. To accomplish this, we replace the call to ``fgets()`` by a call to ``s2e_make_symbolic()``:
+
+.. code-block:: c
+
+    ...
+    char str[3];
+    // printf("Enter two characters: ");
+    // if(!fgets(str, sizeof(str), stdin))
+    //     return 1;
+    s2e_make_symbolic(str, 2, "str");
+    str[2] = 0;
+    ...
+
+Finally, it would be interesting to see an example of input values that cause the program to take a particular execution
+path. This can be useful to reproduce a bug in a debugger, independently of S2E. For that, use the ``s2e_get_example()``
+function. This function gives a concrete example of symbolic values that satisfy the current path constraints (i.e., all
+branch conditions along the execution path).
+
+After these changes, the example program looks as follows:
+
+.. code-block:: c
+
+    #include <stdio.h>
+    #include <string.h>
+    #include <s2e/s2e.h>
+
+    int main(void) {
+        char str[3] = { 0 };
+
+        // printf("Enter two characters: ");
+        // if (!fgets(str, sizeof(str), stdin)) {
+        //     return 1;
+        // }
+
+        s2e_make_symbolic(str, 2, "str");
+
+        if (str[0] == '\n' || str[1] == '\n') {
+            printf("Not enough characters\n");
+        } else {
+            if (str[0] >= 'a' && str[0] <= 'z') {
+                printf("First char is lowercase\n");
+            } else {
+                printf("First char is not lowercase\n");
+            }
+
+            if (str[0] >= '0' && str[0] <= '9') {
+                printf("First char is a digit\n");
+            } else {
+                printf("First char is not a digit\n");
+            }
+
+            if (str[0] == str[1]) {
+                printf("First and second chars are the same\n");
+            } else {
+                printf("First and second chars are not the same\n");
+            }
+        }
+
+        s2e_get_example(str, 2);
+        printf("'%c%c' %02x %02x\n", str[0], str[1],
+               (unsigned char) str[0], (unsigned char) str[1]);
+
+        return 0;
+    }
+
+.. note::
+
+    There are alternatives to ``s2e_get_example`` to get test cases. The simplest one is using the ``TestCaseGenerator``
+    plugin, which is enabled by default, and outputs test cases in ``s2e-last/debug.txt``.
+
+
+Compile the program and try to run it on your host:
+
+.. code-block:: bash
+
+    $ gcc -I$S2ESRC/guest/common/include -O3 tutorial1.c -o tutorial1
+    $ ./tutorial1
+    Illegal instruction
+
+You see the ``Illegal instruction`` message because all ``s2e_*`` functions use
+special CPU opcodes that are only recognized by S2E.
+
+Running the program in S2E
+==========================
+
+Now rerun the program above in S2E, using the launch scripts generated by ``s2e-env``. You should see several states
+forked, one for each possible program input. Each state is a completely independent snapshot of the whole system. You
+can even interact with each state independently, for example by launching different programs. Try to launch
+``tutorial1`` in one of the states again!
+
+In the host terminal (i.e., the S2E standard output), you see various information about state execution, forking and
+switching. This output is also saved into the ``s2e-last/debug.txt`` log file. As an exercise, try to follow the
+execution history of a state through the log file.
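+
+One way to do this is to filter the log for fork and state-switch messages, for example (the exact wording of these
+messages may vary between S2E versions):
+
+.. code-block:: bash
+
+    # Show where states forked and when the engine switched between states
+    grep -E "Forking state|Switching from state" s2e-last/debug.txt
+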
+ +Terminating execution paths +=========================== + +By default, S2E runs paths forever and needs a special order in order to terminate an execution path. The ``s2e-env`` +tool wraps programs in a script that will take care of terminating paths when the program returns or when it crashes. +Sometimes, you may want to terminate the execution path yourself, directly from your program. This is particularly +useful if you run S2E on a system that is not yet supported by ``s2e-env``, such as embedded OSes. + +Terminating an execution path is accomplished with the ``s2e_kill_state()`` function. A call to this function +immediately stops the execution of the current path and exits S2E if there are no more paths to explore. Add a call to +this function just before the program returns control to the OS. Before that, you may want to print example values in +the S2E log using ``s2e_printf()``: + +.. code-block:: c + + int main(void) + { + char str[3] = { 0 }; + + ... + + s2e_get_example(str, 2); + s2e_printf("'%c%c' %02x %02x\n", str[0], str[1], (unsigned char) str[0], (unsigned char) str[1]); + s2e_kill_state(0, "program terminated"); + + return 0; + } + +When you rerun the program, you will see that the logs contain the message ``program terminated``. + +You can also terminate the execution from a script, using the ``s2ecmd`` tool. + +.. code-block:: bash + + guest$ ./tutorial; ./s2ecmd kill 0 "done" diff --git a/docs/src/Tutorials/BasicLinuxSymbex/s2e.so.rst b/docs/src/Tutorials/BasicLinuxSymbex/s2e.so.rst new file mode 100644 index 00000000..5fc0110e --- /dev/null +++ b/docs/src/Tutorials/BasicLinuxSymbex/s2e.so.rst @@ -0,0 +1,184 @@ +==================================== +Symbolic Execution of Linux Binaries +==================================== + +In this tutorial, we will show how to symbolically execute Linux *binaries*, *without* +modifying their source code. Before you start, make sure that you have a working S2E environment. + + +Getting started +--------------- + +We will symbolically execute the ``echo`` utility. For this, create a new analysis project as follows: + +.. code-block:: bash + + # $S2EENV is the root of your S2E environment created with s2e init + s2e new_project -i debian-9.2.1-i386 $S2EENV/images/debian-9.2.1-i386/guestfs/bin/echo abc + +This command creates a new analysis project that invokes ``echo`` with one parameter ``abc``, which would normally print +``abc`` on the standard output. + +Note that this command uses the ``echo`` binary from the guest VM image as the analysis +target. Do not use ``/bin/echo`` as the target, as this will take the one from your host OS, which may not be compatible +with the VM image that S2E uses. + +Finally, we must explicitly pass the image name using ``-i``, because we ``s2e new_project`` may decide to use +a 64-bit image to run the binary, because it is in theory possible to run a 32-bit binary on a 64-bit guest. However, +this could also fail because ``echo`` might use libraries specific to the 32-bit guest. + + +Using ``s2e.so`` +---------------- + +In this section, we show how to make command line arguments symbolic. Open the ``bootstrap.sh`` file and locate the +following line: + +.. code-block:: bash + + S2E_SYM_ARGS="" LD_PRELOAD=./s2e.so ./${TARGET} abc > /dev/null 2> /dev/null + +Modify this line as follows: + +.. code-block:: bash + + S2E_SYM_ARGS="1" LD_PRELOAD=./s2e.so ./${TARGET} abc > /dev/null 2> /dev/null + +This makes argument 1 (``abc``) symbolic. The process works like this: + +1. 
The ``s2e.so`` library is preloaded in the binary using ``LD_PRELOAD``. +2. ``s2e.so`` reads from the ``S2E_SYM_ARGS`` environment variable which arguments to make symbolic. +3. If the variable is missing, ``s2e.so`` leaves all arguments concrete and proceeds with normal execution. +4. If not, ``s2e.so`` overwrites the specified arguments with symbolic values. It is possible to make only some + arguments symbolic and leave others concrete by specifying the corresponding argument IDs. + + .. code-block:: bash + + S2E_SYM_ARGS=" .. " # Mark argument as symbolic + + +You may have noticed ``> /dev/null 2> /dev/null`` at the end of the command. This avoids printing symbolic characters on +the screen and eliminates forks in the kernel. There are some other tricks that ``s2e-env`` enables in ``bootstrap.sh`` +in order to minimize unwanted forks: + +* Do not print crashes in the syslog with ``sudo sysctl -w debug.exception-trace=0`` +* Prevent core dumps from being created with ``ulimit -c 0`` (you may want to re-enable them if needed) + + +.. warning:: + + You **must** specify default concrete arguments, so that ``s2e.so`` can overwrite them with symbolic data. + The following command will not work because there is no argument to make symbolic (``abc`` is missing). + + .. code-block:: bash + + S2E_SYM_ARGS="1" LD_PRELOAD=./s2e.so ./${TARGET} > /dev/null 2> /dev/null + +.. warning:: + + You cannot make the content of a file symbolic by just marking the file name symbolic. In other words, the + following will not have the intended consequence: + + .. code-block:: bash + + S2E_SYM_ARGS="1" LD_PRELOAD=./s2e.so /bin/cat /path/to/myfile + + Instead of making the **content** of ``/path/to/myfile`` symbolic, it makes the **file name** itself symbolic. + The next section explains how to make the content of the file symbolic. + +.. warning:: + + Your binary **must** be dynamically linked, otherwise you cannot use ``s2e.so``. In case you want to make + arguments symbolic for a statically-linked binary, see workarounds below. + + +What about other symbolic input? +-------------------------------- + +You can also feed symbolic data to your program through ``stdin`` or symbolic files. + +For the ``stdin`` method, the idea is to pipe the symbolic output of one program to the input of another. Symbolic +output can be generated using the ``s2ecmd`` utility. The command below passes four symbolic bytes to ``cat``: + +.. code-block:: bash + + ./s2ecmd symbwrite 4 | cat + +If your binary is statically linked, you could pass it symbolic arguments as follows: + +.. code-block:: bash + + /bin/echo $(./s2ecmd symbwrite 4) + +Note that this may be much slower than using ``s2e.so`` as symbolic data has to go through several layers of OS and +libraries before reaching the target binary. + +If your binary takes a file name as a parameter and you want the content of that file to be symbolic, the simplest is to +create your analysis project as follows: + +.. code-block:: bash + + # The @@ is a placeholder for a concrete file name that contains symbolic data + s2e new_project -i debian-9.2.1-i386 $S2EENV/images/debian-9.2.1-i386/guestfs/bin/cat @@ + +This generates a bootstrap file that creates a symbolic file in ramdisk (i.e., in ``/tmp`` on Linux), writes +some symbolic data to that file, and passes the path to that file to ``cat``. The symbolic file must be stored in RAM +(hence the ramdisk, or tmpfs). Writing symbolic data to a hard drive will concretize it. + +.. 
note:: + + In case of ``cat``, you may not see any forks with the command above, as the standard output is redirected + to ``/dev/null`` and the symbolic data is therefore never branched upon. You must tweak the command line + according to the aspects of the binary you want to test. + + +Configuring S2E for use with ``s2e.so`` +--------------------------------------- + +``s2e-env`` automatically configures all plugins required to use ``s2e.so``. Read this section if you want to know +more about the configuration. You do not need to worry about this during normal use and can skip the rest of this +tutorial. + +``s2e.so`` requires two plugins to operate: ``BaseInstructions`` and ``LinuxMonitor``. The first provides general +infrastructure to communicate with plugins, while the second keeps track of various OS-level events (e.g., process +loads or thread creation). The S2E configuration file must contain default settings for these +plugins, as follows: + + +.. code-block:: lua + + plugins = { + -- Enable S2E custom opcodes + "BaseInstructions", + + -- Track when the guest loads programs + "LinuxMonitor", + } + + +Besides making command line arguments symbolic, ``s2e.so`` also reads ``/proc/self/maps`` to figure out which shared +libraries are loaded by the process and communicates their location to ``LinuxMonitor``. ``LinuxMonitor`` then +broadcast this information to any interested plugins. For example, the code coverage plugin uses this information +to map program counters to a module name. + +.. warning:: + + There is no ``s2e.so`` for Windows yet. In order to make program arguments symbolic, you must modify the + source code manually. However, writing symbolic data to the standard input or to the ramdisk works like on Linux. + On Windows, programs and shared libraries are tracked by a special guest driver, ``s2e.sys``, that communicates with + ``WindowsMonitor``. + + +Modifying and building ``s2e.so`` +--------------------------------- + +If you use ``s2e-env`` and stock VM images, ``s2e.so`` is automatically copied into the guest VM each time +you start the analysis. You do not have to do anything special unless you want to modify it. + +The ``s2e.so`` library source can be found in the ``guest`` folder of the S2E source directory and is built during the +S2E build process. It can also be built manually by running ``make -f $S2ESRC/Makefile guest-tools-install`` from the +build directory. This creates ``guest-tools32`` and ``guest-tools64`` in ``$S2EDIR/build/$S2E_PREFIX/bin`` (by default +``$S2E_PREFIX`` is equal to ``opt``). + +The latest build of ``s2e.so`` is copied in your guest VM next time you start the analysis, so all you need is to run +the ``make`` command above if you modify the source code of ``s2e.so``. 
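+
+Putting it all together, a typical rebuild cycle after modifying the library might look as follows (this sketch assumes
+the default ``opt`` install prefix and the standard ``s2e-env`` directory layout):
+
+.. code-block:: bash
+
+    cd $S2EDIR/build
+    make -f $S2ESRC/Makefile guest-tools-install
+
+    # The rebuilt guest tools (including s2e.so) end up here and are copied
+    # into the guest VM the next time you launch the analysis
+    ls $S2EDIR/build/opt/bin/guest-tools32 $S2EDIR/build/opt/bin/guest-tools64
+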
diff --git a/docs/src/Tutorials/PoV/arch.odg b/docs/src/Tutorials/PoV/arch.odg
new file mode 100644
index 00000000..6f653dd1
Binary files /dev/null and b/docs/src/Tutorials/PoV/arch.odg differ
diff --git a/docs/src/Tutorials/PoV/arch.svg b/docs/src/Tutorials/PoV/arch.svg
new file mode 100644
index 00000000..2b32a765
--- /dev/null
+++ b/docs/src/Tutorials/PoV/arch.svg
@@ -0,0 +1,366 @@
+[SVG architecture diagram. Core plugins: CorePlugin, BaseInstructions, OS event monitors (guest drivers s2e.sys /
+s2e.ko), ModuleMap, MemoryMap, ProcessExecutionDetector, TestCaseGenerator. Domain-specific plugins: Recipe,
+POVGenerationPolicy, FilePovGenerator, DecreePovGenerator. Other plugins (search heuristics, execution tracers,
+code coverage, crash dump generators) are omitted and can be enabled/disabled as needed.]
\ No newline at end of file
diff --git a/docs/src/Tutorials/PoV/cadet_00001_tui.png b/docs/src/Tutorials/PoV/cadet_00001_tui.png
new file mode 100644
index 00000000..830c7f3c
Binary files /dev/null and b/docs/src/Tutorials/PoV/cadet_00001_tui.png differ
diff --git a/docs/src/Tutorials/PoV/index.rst b/docs/src/Tutorials/PoV/index.rst
new file mode 100644
index 00000000..9b521cdf
--- /dev/null
+++ b/docs/src/Tutorials/PoV/index.rst
@@ -0,0 +1,432 @@
+===============================================================
+Using S2E to generate PoVs for Linux, Windows, and CGC binaries
+===============================================================
+
+In this tutorial, you will learn:
+
+* How to configure S2E in order to automatically generate proofs of vulnerability (PoVs) for Linux,
+  Windows, and Decree binaries.
+* How S2E implements PoV generation, so that you can extend it for your own needs.
We will show an overview + of the S2E plugin architecture as well as the plugins involved in PoV generation. + +.. note:: + + Before starting this tutorial, make sure that... + * you understand the `theory `__ behind automated PoV generation + * you have a working S2E environment ready (e.g, in the ``~/s2e/env/`` directory) + + +Quickstart on Windows and Linux +=============================== + +S2E comes with a ``vulnerabilities`` demo which demonstrates several types of vulnerabilities that S2E can detect. +You can find the corresponding source file in +`vulnerabilities.c `__. + +1. Build the guest tools. We will use the 32-bit Linux version here. + You may also use the 64-bit version and/or the Windows version (in ``guest-tools{32|64}-win``). + +.. code-block:: console + + $ cd ~/s2e/env/build/s2e/guest-tools32 + $ make + +2. Create an analysis project + +.. code-block:: console + + $ cd ~/s2e/env + $ s2e new_project -i debian-9.2.1-i386 -n vuln-lin32-32 --enable-pov-generation \ + build/s2e/guest-tools32/common/demos/vulnerabilities @@ + +Here is a breakdown of the ``s2e new_project`` command above: + +* It creates a new project called ``vuln-lin32-32`` that will run on the ``debian-9.2.1-i386`` image. + +* The ``--enable-pov-generation`` is important in order to generate PoVs. If you omit it, you will just get random test + cases, which will at best crash the binary depending on what the constraint solver generates. Vulnerability generation + involves a number of additional plugins that may add overhead to the analysis, so they are disabled by default unless + you actually need them. + +* Finally, the ``@@`` marker tells that the binary takes as input a file. This marker will be replaced at runtime with + an actual file containing symbolic data. + +.. note:: + + You should build the desired guest image first, or use the ``-d`` flag if you want to download a pre-built image + instead. Only Linux-based images are available for download. You must build Windows images yourself. + +3. Launch S2E and wait until the analysis completes (it should take less than 30 seconds). + +.. code-block:: console + + $ cd ~/s2e/env/projects/vuln-lin32-32 + $ ./launch-s2e.sh + +Look at the files inside ``~/s2e/env/projects/vuln-lin32-32/s2e-last``. This folder contains all the analysis results. +You should have files that look like this: + +.. code-block:: console + + $ ls ~/s2e/env/projects/vuln-lin32-32/s2e-last + _tmp_input-crash@0x0-pov-unknown-0 + _tmp_input-crash@0x0-pov-unknown-3 + _tmp_input-recipe-type1_i386_generic_reg_ebp.rcp@0x8048760-pov-type1-0 + _tmp_input-recipe-type1_i386_generic_reg_edi.rcp@0x8048760-pov-type1-0 + _tmp_input-recipe-type1_i386_generic_shellcode_eax.rcp@0x804883e-pov-type1-3 + ... + +These are concrete input files that demonstrate the vulnerability. Pick one of them and run it on the binary on your +host machine under GDB, as follows. + + +4. Confirm that the PoVs exercise the vulnerabilities + +.. code-block:: console + + $ gdb --args ./vulnerabilities s2e-last/_tmp_input-recipe-type1_i386_generic_shellcode_eax.rcp@0x804883e-pov-type1-3 + (gdb) r + + +The program should crash at program counter ``0x44556677`` and contain ``0xccddeeff`` in the ``eax`` register. + +.. code-block:: console + + Starting program: /home/user/s2e/env/projects/vuln-lin32-32/vulnerabilities + s2e-last/_tmp_input-recipe-type1_i386_generic_shellcode_eax.rcp@0x804883e-pov-type1-3 + + Demoing function pointer overwrite + + Program received signal SIGSEGV, Segmentation fault. + 0x44556677 in ?? 
() + (gdb) info registers + eax 0xccddeeff -857870593 + ecx 0x44556677 1146447479 + edx 0x40 64 + ebx 0x0 0 + esp 0xffffcd1c 0xffffcd1c + ebp 0xffffcd68 0xffffcd68 + esi 0x804b008 134524936 + edi 0xf7fa2000 -134602752 + eip 0x44556677 0x44556677 + eflags 0x10246 [ PF ZF IF RF ] + cs 0x23 35 + ss 0x2b 43 + ds 0x2b 43 + es 0x2b 43 + fs 0x0 0 + gs 0x63 99 + + +That's it. What you have here is an input file that proves that an attacker has full control of the program counter as +well as an additional general purpose register. + +Understanding recipes +===================== + +So where do the magic values ``0x44556677`` and ``0xccddeeff`` come from? Short answer: from the ``FilePovGenerator`` +plugin configuration. Open ``s2e-config.lua`` and look for the following section: + +.. code-block:: lua + + ------------------------------------------------------------------------------- + -- This plugin writes PoVs as input files. This is suitable for programs that + -- take their inputs from files (instead of stdin or other methods). + add_plugin("FilePovGenerator") + pluginsConfig.FilePovGenerator = { + -- The generated PoV will set the program counter + -- of the vulnerable program to this value + target_pc = 0x0011223344556677, + + -- The generated PoV will set a general purpose register + -- of the vulnerable program to this value. + target_gp = 0x8899aabbccddeeff + } + +You can specify in the configuration file which register values to set for the concrete test case. +The configuration accepts 64-bit values, which the PoV generator will truncate if the program is 32-bit. + + +Now, in order to understand why these values get written into the program counter and the ``eax`` register, we need to +look at recipes. If you look closer, you will notice that the test case file name above contains +``type1_i386_generic_shellcode_eax``. This means that this test case was derived from the +``type1_i386_generic_shellcode_eax.rcp`` recipe. You can find the recipes in the `recipes` folder in your project +directory. The recipe used here has the following contents: + + +.. code-block:: console + :caption: type1_i386_generic_shellcode_eax.rcp + + # Set GP and EIP with shellcode + # mov eax, $gp + # mov ecx, $pc + # jmp ecx + :reg_mask=0xffffffff + :pc_mask=0xffffffff + :type=1 + :arch=i386 + :platform=generic + :gp=EAX + :exec_mem=EIP + [EIP+0] == 0xb8 + [EIP+1] == $gp[0] + [EIP+2] == $gp[1] + [EIP+3] == $gp[2] + [EIP+4] == $gp[3] + [EIP+5] == 0xb9 + [EIP+6] == $pc[0] + [EIP+7] == $pc[1] + [EIP+8] == $pc[2] + [EIP+9] == $pc[3] + [EIP+10] == 0xff + [EIP+11] == 0xe + +Here is what every line of this recipe means: + +* ``reg_mask`` and ``pc_mask`` indicate which bits of the general purpose register and program counter can be + controlled by the attacker. It will be ``0xffffffff`` in almost all cases, meaning that all bits can be controlled. + +* ``type`` can be either 1 or 2 (see DARPA's CGC `terminology + `__). + Type 1 recipes control register values. Type 2 allow memory exfiltration. + +* ``arch`` and ``platform`` define when the recipe is applicable. In this case, we have a recipe that works on 32-bit + programs and all OSes (Windows, Linux, etc...). If you have OS-specific shellcode, you must set the ``platform`` + field accordingly. Please refer to the recipe plugin implementation to get an updated list of supported platforms. + +* ``gp`` means which general purpose register this recipe controls + +* ``exec_mem`` indicates which register must point to executable memory in order for the shell code to work. 
+ Executable memory is not required for more complex recipes that use ROP chains. + +* ``[EIP+0] == 0xb8`` means that the first byte of the memory location referenced by ``EIP`` at the moment of + exploitation must contain ``0xb8``. The value of ``EIP`` itself is not directly controlled by the recipe. + Instead, the recipe plugin enumerates all suitable memory areas in the program's address space and picks one area that + satisfies the constraints (i.e., the area contains enough symbolic bytes that can be suitably constrained to generate + the PoV). This also means that the chosen area must be fixed across program invocations. If the recipe plugin ends up + choosing, say, an executable stack location, the resulting PoV may not be replayable (i.e., running it in GDB may not + produce the desired crash). In practice, if the guest OS uses data execution prevention (DEP)s, you will need to + encode a ROP chain in the recipe. The ``vulnerabilities`` demo allocates an executable area at a fixed location in + order to simplify the recipe. + + + +PoVs for DARPA Decree/CGC binaries +================================== + +DARPA's `Cyber Grand Challenge `__ (CGC) was the world's first all-machine +hacking tournament. S2E was a key component in CodeJitsu's Cyber Reasoning System (CRS) and was used to automatically +find vulnerabilities and exploit them. This demo walks you through the process of using S2E to find and generate a +"proof of vulnerability" (PoV - i.e. an exploit) in a CGC challenge binary (CB). + +The CGC Final Event (CFE) ran on the Decree operating system. Decree is a modified Linux OS with a reduced number of +`system calls `__. In addition to this, the Decree OS has been modified +to add "hook points" for S2E (e.g. to signal process creation, termination, etc.) and to allow S2E to inject symbolic +values. The source code for the Decree OS is available at https://github.com/S2E/s2e-linux-kernel. A Decree +virtual machine image can be built by running the following command: + +.. code-block:: console + + $ cd ~/s2e/env + $ s2e image_build cgc_debian-9.2.1-i386 + + +Next, create an analysis project for a challenge binary. Sample CBs are available `here +`__ and can be built using the instructions `here +`__ . The +remainder of this tutorial will focus on the CADET_00001 program (a pre-compiled version of which is available `here +`__), but the ideas and techniques should be applicable +to all of the CBs. + +The following command creates a ``projects/CADET_00001`` directory with various scripts and configuration files needed +by S2E, as described `here <../../s2e-env.rst>`__. + +.. code-block:: console + + $ s2e new_project --image cgc_debian-9.2.1-i386 ./source/s2e/decree/samples/CADET_00001 + +Finally, to start S2E, run the following command: + +.. code-block:: console + + $ s2e run CADET_00001 + +This will display a TUI-based dashboard, similar to that used by the American Fuzzy Lop (AFL) fuzzer. As S2E finds +vulnerabilities, it generates PoV files in the ``s2e-last`` directory. These files have either ``.xml`` or ``.c`` +file extensions. Once some PoV files have been generated you can press ``q`` to stop S2E. + +.. image:: cadet_00001_tui.png + +Alternatively, you can run S2E without the TUI by using the ``-n`` option in ``s2e run``. Instead of the TUI you will +see the standard S2E output. Once some POVs have been generated you can stop S2E by killing the process with +``Ctrl+C`` or ``killall -9 qemu-system-i386``. 
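If S2E is running in the background (or you simply want to check progress from another terminal), listing the results
directory is enough to see whether any PoVs have been generated yet. The command below is just one way to do this and
assumes the project name used earlier in this tutorial; if the globs do not match anything yet, no PoV has been found
so far:

.. code-block:: console

    $ ls ~/s2e/env/projects/CADET_00001/s2e-last/*.xml ~/s2e/env/projects/CADET_00001/s2e-last/*.c
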
+ + +Understanding CGC-style PoVs +============================ + +If you followed the tutorial on PoV generation on Linux, you will notice that the PoV format for CGC binaries is +different. Instead of being a concrete input file, CGC binaries produce PoVs in ``.xml`` or ``.c`` format. The reason +for this is that CGC binaries read their input from ``stdin`` and write results to ``stdout``. So in order to exercise +the vulnerability, the PoV must implement a two-way communication with the program, by reading the program's output and +writing an appropriate input. This is different from file-based PoVs, where all the input is sent to the program at +once, and the program's output is ignored. + +.. note:: + + Many binaries, not just CGC binaries, use an interactive type of communication format, where input is read from + stdin and results are written to stdout (e.g., command line utilities). S2E only supports file-based PoVs on Linux + and Windows. Supporting interactive binaries for other platforms is work in progress. + + +For this reason, replaying a CGC-style PoV is more complex. It requires a special setup so that the PoV can communicate +with the CB. For more details, see `here +`__. +The following outlines the steps required to replay a PoV: + +1. Follow the instructions `here + `__ to + setup and run the CGC testing VM + +2. As discussed in the instructions in the previous step, files can be shared between the host and CGC testing VM via + the ``/vagrant`` directory. Copy the CADET_00001 binary, the PoV XML files generated by S2E and `this + `__ script (located in your S2E environment in + ``bin/cgc-tools/test_pov.sh``) to the CGC testing VM. + +3. Run ``vagrant ssh`` to access the VM and copy the files from ``/vagrant`` into ``/home/vagrant``. Then run the + ``test_pov.sh`` script to check the PoV's correctness. + + +Plugin architecture overview +============================ + +In this section, we will give an overview of the plugins involved in generating a PoV. The diagram below summarizes +the relationship between plugins. + +.. image:: arch.svg + +S2E has an event-based plugin architecture. The execution engine exports a core set of low level events (declared in +`CorePlugin.h `__) to which plugins must +subscribe if they want to do anything useful. The most important core events are related to guest instruction +translation. Plugins must use them if they want to instrument guest code (e.g., to be notified when some instructions +of interest are executed). Plugins may also define and export their own high-level events that other plugins can +listen to. For example, an OS monitoring event could instrument the guest kernel so that it can notify other plugins +about process and thread creation. + +S2E plugins can be classified in roughly two sets: generic plugins and domain-specific plugins. Generic plugins can be +seen as library functions in a programming language, which form building blocks for domain-specific plugins (i.e., the +application/tool built on top of S2E). In the diagram above, the generic plugins take care of abstracting away low level +details of guest execution (e.g., keeping track of processes running in the guest for a given execution state, building +the memory and module map, etc). Domain-specific plugins rely on these generic plugins to simplify their implementation +and focus on the problem to solve. For example, the recipe plugin can focus on monitoring vulnerable instructions +in the processes of interest. 
It does not have to worry about figuring out to which process every instruction belongs. +OS monitoring plugins take care of that. + +S2E provides two ways to monitor guest execution: introspection or guest agents. Introspection consists of observing the +execution stream or guest memory and react to OS-specific constructs without injecting code or modifying the guest in +any way. For example, in order to get the current thread and process ID for Windows guests, the ``WindowsMonitor`` +plugin probes several kernel data structures in memory. ``WindowsMonitor`` does this without involving the guest. A +guest agent is a program that runs in the guest and uses the guest's APIs in order to communicate relevant events to S2E +plugins. For example, ``WindowsMonitor`` relies on the ``s2e.sys`` guest driver to monitor some OS events that would be +difficult to catch with pure introspection. + +In general, it is much simpler to use guest agents whenever possible. For example, in order to monitor Windows processes +creation, it is simpler to write a guest driver that registers a callback through the +``PsSetCreateProcessNotifyRoutine`` kernel API and then calls S2E via ``s2e_invoke_plugin("WindowsMonitor", ...);`` +instead of having ``WindowsMonitor`` figure out where Windows stores its data structures in memory and then trying to +parse them. The difficulty is compounded by the fact that these structures are mostly undocumented and change with every +OS revision. + +Some tasks can be solved by combining introspection with guest agents. One example is getting the current process and +thread ID from a plugin. Ideally, the plugin should be able to call the guest OS directly in order to get this +information. However, the execution model of S2E only allows the guest calling plugins, not the reverse. So in order, to +get these IDs, ``WindowsMonitor`` has to use introspection. However, it gets some help from the guest agent, which +communicates to ``WindowsMonitor`` the locations of key Windows data structures in memory and offsets in these +structures. + +Plugins involved in PoV generation +================================== + +In the previous section, we gave an overview of the S2E plugin architecture. Here we explain in more details the +plugins that are involved in detecting vulnerabilities and generating PoVs. + +**Recipe** + This is the most important plugin for PoV generation. It monitors execution and looks for interesting symbolic + addresses (assignment to program counters and memory reads/writes). When it finds one, it tries to constrain + the address in such a way that leads to controlling registers and memory content according to the specification in + the recipe. The recipe plugin supports `Type 1 and Type 2 + `__ + PoVs. PoV generation will not work without recipes. + +**PovGenerationPolicy** + Sometimes, the recipe plugin generates dozens of redundant PoVs that have only small variations. + This plugin filters the PoVs generated by the recipe plugin to keep only those that are interesting. + When the plugin decides that a PoV is worth keeping, it calls the actual PoV generation plugin, + e.g., ``DecreePovGenerator`` or ``FilePovGenerator`` described below. + +**DecreePovGenerator** + This plugin generates PoVs using the `standard + `__ defined by + DARPA. This PoV generator is designed for interactive programs that consist of a sequence of reads and writes to the + standard input/output pair. 
+ +**FilePovGenerator** + This plugin generates PoVs suitable for use by programs that take simple files as input and do not interact + through the standard input/output. + +**CGCInterface** + This plugin collects PoVs and other interesting events and sends them to a backend server through a JSON API. + This is useful if you want to integrate S2E into a cluster and monitor progress from a centralized console. + We built this plugin for the CGC competition initially (hence the name), but it could be made more generic. + + +The plugins above are useful to generate PoVs. They do not help finding them. Finding vulnerabilities is the job +of search heuristics, described below: + + +**SeedSearcher** + Seed files (or test inputs) are concrete inputs for the program under analysis. These files can be anything that + the program accepts (PNG files, documents, etc.). They can be obtained from a fuzzer, generated by hand, etc. + For CGC, seeds are binary executables compiled from XML of C PoV format. + + The ``SeedSearcher`` plugin fetches seed files to concolically guide execution in the target program. Seed files + are placed in the seeds directory. During analysis, the ``SeedSearcher`` plugins polls the seeds directory for new + seeds. When it finds new seeds, the plugin forks a new state that fetches the new seed and then runs the binary + using that seed as input. + + Seed files can have different priorities. For example, if a fuzzer finds a seed that crashes the program, S2E may + want to use that seed before others that, e.g., only cover new basic blocks. The priority of a seed is specified in + its name. Seed files use the following naming convention:: + + -. + + The index specifies the order of the seed. ``SeedSearcher`` fetches seed files by increasing index number. Higher + priorities are specified with higher integer. In a given batch of seeds, ``SeedSearcher`` will schedule those with + the highest priority first. + + When there are many seed files, it is advantageous to run S2E on multiple cores. In this mode, the ``SeedSearcher`` + will automatically load balance available seeds across all available cores. For example, if there are 40 cores + available, ``SeedSearcher`` will attempt to run 40 seeds in parallel. + + The ``SeedSearcher`` plugin works in conjunction with the guest bootstrap file. The bootstrap file is built in such + a way that state 0 runs in an infinite loop and forks a new state when a new seed is available. If there are no + seed files, the bootstrap forks a state in which the program is run without seeds. + + +**CUPA Searcher** + This searcher implements the Class Uniform Path Analysis (CUPA) algorithm as described in `this + `__ paper. It can work together with the ``SeedSearcher`` plugin. + +The bootstrap script +==================== + +The bootstrap script is a file called ``bootstrap.sh`` that the guest fetches from the host and executes. It contains +instructions on how to execute the program under analysis. More detail can be found in the +`s2e-env <../../s2e-env.rst>`__ documentation. + +The CGC ``bootstrap.sh`` script slightly differs from Linux bootstraps. One key difference is that seeds will always be +enabled for CGC projects, so the ``while`` loop in the ``execute`` function will exist even if you do not intend to use +seed files. Note that it will not affect symbolic execution - the ``SeedSearcher`` (described above) will simply never +schedule this state (state 0) for execution. 
This state will always exist, which means that even if S2E explores all +execution paths in the CB (which is possible for simple binaries such as CADET_00001), the analysis will not end +because not all states have been killed. The analysis must therefore be manually stopped. diff --git a/docs/src/Tutorials/PoV/pov.rst b/docs/src/Tutorials/PoV/pov.rst new file mode 100644 index 00000000..9638a07c --- /dev/null +++ b/docs/src/Tutorials/PoV/pov.rst @@ -0,0 +1,601 @@ +======================================================== +Automated Generation of Proofs of Vulnerability with S2E +======================================================== + +In this tutorial, we will show you how to use the S2E analysis platform to automatically find vulnerable spots in +binaries and generate proofs of the existence of the vulnerabilities. These proofs can then be used by developers to +easily reproduce, understand, and fix the bugs that lead to the vulnerabilities. + +S2E is a platform for in-vivo analysis of software systems that combines a virtual machine with symbolic execution. +Users install and run in S2E any x86 or ARM software stack, including programs, libraries, the OS kernel, and drivers. +S2E comes with a comprehensive set of plugins to perform various types of analyses, such as bug finding, performance +profiling, reverse engineering, and of course vulnerability analysis as well as proof of vulnerabilitiy (PoV) +generation. Attackers exploit binaries by supplying carefully-crafted inputs that force the program to execute malicious +code or leak confidential data out of the program's memory. An attacker typically feeds programs abnormally large +strings in an attempt to gain control of the program counter. The attacker would choose the input data such that the +corrupted program counter points to the location of the malicious payload, e.g., the attacker's shellcode. + +Automating this can be decomposed in two parts: finding the vulnerable program location and generating the proof of +vulnerability (PoV). In this tutorial, we will assume that the vulnerable instruction has been found and will +focus on explaining how to generate the PoV. + +.. note:: + + Although not required, we recommend that you get familiar with the DARPA CyberGrandChallenge (CGC) in order to have + a better understanding of this tutorial. The CGC documentation is on + `Github `__ + and details about the event are `here `__. + + DARPA's `Cyber Grand Challenge `__ (CGC) was the world's first + all-machine hacking tournament. S2E was a key component in CodeJitsu's Cyber Reasoning System (CRS) and was used to + automatically to find vulnerabilities and exploit them. This tutorial walks you through the theory behind automated + PoV generation. After you are done reading it, you can get your hands dirty in this `follow-up `__. + + +Understanding the Execution of a Vulnerable Program +=================================================== + +Consider the following program. It receives data from the network and stores it into a buffer. The buffer has a length +of 4 bytes and the receive function tries to write 12 bytes into it, causing a buffer overflow. + +.. code-block:: c + :number-lines: + + int main(int argc, char **argv) + { + char buf[4]; + receive(buf, 12); + return 0; + } + + +Since the processor executes machine instructions, we must reason about the binary form of this program. Here is how the +assembly code for this program could look like, with the most important instructions explained. + +.. 
code-block:: none + :number-lines: + + push ebp + mov ebp, esp + sub esp, 4 ; Allocate 4 bytes on the stack for buf + lea eax, [ebp-4] ; Compute address of buf + push 12 ; Push 12 for 2nd parameter of receive + push eax ; Push address of buf for 1st param of receive + call receive + xor eax, eax ; Set return value to 0 + leave ; Clean the stack frame + ret ; Return from the main function + +Let's now see what the program would execute if the attacker sends the string ``AAAABBBBCCCC``. Assume that the stack +pointer register is ``0xf0`` at instruction 1 and the frame pointer is ``0x1000``. After executing instruction 6 and +right before calling receive, the program stack could look like this: + +.. code-block:: none + + 0xf8: 0xabc0 ; argv + 0xf4: 0xdef0 ; argc + 0xf0: 0x800231 ; return address + 0xec: 0x1000 ; saved frame pointer + 0xe8: buf ; space for the buffer (allocated at line 3) + 0xe4: 12 ; size of the parameter passed to receive + 0xe0: 0xe8 ; address of the buffer on the stack + +After calling receive, the stack looks like this: + +.. code-block:: none + + 0xf8: 0xabc0 ; argv + 0xf4: 0xdef0 ; argc + 0xf0: CCCC ; return address + 0xec: BBBB ; saved frame pointer + 0xe8: AAAA ; space for the buffer (allocated at line 3) + 0xe4: 12 ; size of the parameter passed to receive + 0xe0: 0xe8 ; address of the buffer on the stack + + +As you can see, the receive call overwrote the neighbouring memory locations, corrupting the original frame pointer and +the return address. The return instruction at line 10 will jump straight to the address ``CCCC``, fully controlled by +the attacker. All the attacker needs to do here is to figure out what bytes of the inputs end up in the program counter. + +An automated PoV generator would have to do here is to detect when the input reaches the program counter, figure out +which bytes of the input end up in the program counter, and compute the actual input byte values so that the program +counter has the desired address. + + +Using Symbolic Execution to Generate PoVs +----------------------------------------- + +Performing this mapping can be done with a simple technique called symbolic execution. In normal execution (aka +"concrete" execution), the program gets concrete inputs (e.g., ``1``, ``2``, ``"abc"``, etc.), performs computations on +them, and produces concrete outputs. In symbolic execution, the program gets "symbolic" inputs instead (e.g., λ\ +:sub:`0`\ λ\ :sub:`1`\ λ\ :sub:`2`\ λ\ :sub:`3`\ ). These symbolic inputs propagate through the program and build +mathematical formulas ("symbolic expressions" or "symbolic values") as execution progresses. + +Symbolic values coexist side-by-side with concrete values, and just like concrete values, can be read and written to +memory and processor registers. Moreover, at any point of execution, it is possible to plug any such mathematical +formula into a solver in order to compute concrete inputs, e.g., to generate a PoV. + +Executing a program symbolically requires a symbolic execution engine. You can think of it as an emulator that +continuously fetches binary instructions, decodes them, checks if the operands contain symbolic data, and if so creates +a symbolic expression out of the operands, and otherwise computes the result concretely as usual. The engine extends the +register file and the memory with an array of pointers that store a reference to the symbolic expression or null if the +location is concrete. When the system starts, there is no symbolic data in the system and everything runs concretely. 
In +order to initiate symbolic execution, the engine therefore needs to provide a mechanism to create fresh symbolic +variables and write them to the desired memory location. S2E, which is based on virtualization, conveniently provides a +custom machine instruction (e.g., a special x86 instruction for x86 targets) that can be used from inline assembly. + + +In order to run the program above symbolically, one needs to define a source of symbolic values. This source is the +receive system call. The symbolic execution engine would need to somehow intercept the call to ``receive`` and replace +it with a custom implementation that injects symbolic values into the buffer instead of reading concrete data from the +network. When using S2E, this can be easily done with ``LD_PRELOAD`` or, for static binaries, by tweaking the receive +syscall in the Linux kernel. S2E provides a custom x86 instruction to create symbolic values. For the example above, +this can be as simple as transforming receive into: + +.. code-block:: c + :number-lines: + + int receive(void *buf, size_t size) + { + s2e_make_symbolic(buf, size, "input_buffer"); + return size; + } + +``s2e_make_symbolic`` is nothing more than a function written in assembly that contains a hard-coded sequence of bytes +for the custom x86 opcode that instructs the symbolic execution engine to write a fresh symbolic value to the desired +memory location. Each symbolic variable gets a name (e.g., ``"input_buffer"``) in order to simplify test case +generation. When running the previous example inside a symbolic execution engine, the stack would look like this when +receive returns: + +.. code-block:: none + + 0xf8: 0xabc0 ; argv + 0xf4: 0xdef0 ; argc + 0xf0: λ11λ10λ9λ8 ; input_buffer[8..11] + 0xec: λ7λ6λ5λ4 ; input_buffer[4..7] + 0xe8: λ3λ2λ1λ0 ; input_buffer[0..3] + 0xe4: 12 ; size of the parameter passed to receive + 0xe0: 0xe8 ; address of the buffer on the stack + +The symbolic execution engine eventually reaches the return instruction at line 10, at which point it tries to write the +symbolic value at address 0xe8 into the program counter. The engine detects that the value is symbolic and stops +execution. The engine cannot continue execution at this stage because it does not know the target of a symbolic program +counter. A symbolic program counter could point to any memory location and the analysis engine would have a pretty hard +time choosing on its own a concrete value that makes sense. + +This is where S2E analysis plugins come into play. Plugins hook into the execution engine and react to various events of +interest. The S2E engine exposes dozens of events, allowing developers to implement powerful analysis tools. For +example, plugins could observe the instruction stream and react to symbolic pointers. This is useful for PoV generation, +as symbolic pointers that end up in critical registers (like a program counter) are often an indication of a +vulnerability. Plugins could also look at which instructions were executed, in order to compute code coverage, etc. + +S2E uses the ``Recipe`` plugin in order to determine whether an instruction can be exploited and generate inputs for the +PoV. The recipe plugin takes as input a set of pre-computed constraints for registers (the "recipe"). When a potentially +vulnerable spot is reached, the plugin appends the recipe constraints to the current set of path constraints, then asks +the solver to compute concrete inputs. If the solver succeeds in computing the inputs, the plugin found a PoV. 
If not, +the recipe plugin resumes execution, looking for other vulnerable spots. In the example above, suppose that the recipe +states that the program counter must be equal to ``0x801002`` and the frame pointer must be set to ``0xdeadbeef`` in +order to demonstrate the vulnerability. When execution reaches the return instruction, the solver will be fed additional +constraints λ\ :sub:`11`\ λ\ :sub:`10`\ λ\ :sub:`9`\ λ\ :sub:`8`\ == 0x00801002 and λ\ :sub:`7`\ λ\ :sub:`6`\ λ\ +:sub:`5`\ λ\ :sub:`4`\ == 0xdeadbeef. The solver will determine that this is feasible, and will then return the +following concrete input bytes: ``ff ff ff ff ef be ad de 02 10 80 00``. Values for λ\ :sub:`3`\ λ\ :sub:`2`\ λ\ +:sub:`1`\ λ\ :sub:`0`\ are not important (i.e., they have no constraints), so the solver can choose anything for them +(here ``0xffffffff``). + +The following is the simplest possible recipe accepted by the ``Recipe`` plugin. It specifies a ``Type 1`` +vulnerability, in which the attacker can control the program counter (EIP register), as well as a general purpose +register (here, it is ``EAX``). The mask specifies which bits of these registers the attacker can control. The lines of +the form ``EIP[0] == $pc[0]`` represent constraints on the symbolic registers. The left hand side is the register, the +right hand side is a variable that represents a concrete value negotiated with the CGC framework (the framework chooses +a random ``EIP`` value to check that the exploit works for any ``EIP`` value). + +.. note:: + + We use the DARPA CyberGrandChallenge terminology, which defines ``Type 1`` and ``Type 2`` vulnerabilities. + Refer to the CGC `documentation `__ for more details. + +.. code-block:: none + + :type=1 + :arch=i386 + :platform=generic + :gp=EAX + :reg_mask=0xffffffff + :pc_mask=0xffffffff + EIP[0] == $pc[0] + EIP[1] == $pc[1] + EIP[2] == $pc[2] + EIP[3] == $pc[3] + EAX[0] == $gp[0] + EAX[1] == $gp[1] + EAX[2] == $gp[2] + EAX[3] == $gp[3] + +The following is a more complex recipe that contains shellcode. The lines of the form ``[EIP+XXX] == YY`` represent a +constraint on a memory location at address ``EIP + XXX``. For example, ``EIP+0`` must be equal to ``0xb8``. When the +symbolic execution engine encounters a symbolic program counter, it checks that the recipe constraints can be satisfied, +and if so, generates the PoV. + +.. code-block:: none + + # Set GP and EIP with shellcode + # mov eax $gp + # mov ebx, $pc + # jmp ebx + :type 1 + :reg_mask=0xffffffff + :pc_mask=0xffffffff + :gp=EAX + :exec_mem=EIP + [EIP+0] == 0xb8 + [EIP+1] == $gp[0] + [EIP+2] == $gp[1] + [EIP+3] == $gp[2] + [EIP+4] == $gp[3] + [EIP+5] == 0xbb + [EIP+6] == $pc[0] + [EIP+7] == $pc[1] + [EIP+8] == $pc[2] + [EIP+9] == $pc[3] + [EIP+10] == 0xff + [EIP+11] == 0xe3 + +Identifying Advanced Vulnerability Patterns with S2E +==================================================== + +In the previous sections, we explained how to detect basic return address overwrites and generate simple PoVs. The idea +was to use symbolic execution in order to track the flow of symbolic input data into sensitive registers, such as the +program counter, then use the constraint solver in order to generate valid PoVs according to pre-computed recipes. PoV +generation leverages the ability of S2E to detect memory accesses through symbolic pointers, detect changes of control +flow to a symbolic address, and detect function calls with symbolic parameters. When S2E detects these events, it +notifies the recipe plugin. 
The plugin then goes through the set of recipes and if one of them satisfies the current +path constraints, generates a PoV. This is sufficient to exploit stack/heap overflows, arbitrary memory writes, lack of +input validation, etc. + +In this section, we will look into more advanced vulnerability patterns that S2E can detect. All these patterns are +based on the ability of S2E to detect uses of symbolic pointers, like assignment to program counter or simple +dereference. We will see how to detect and exploit function pointer overwrites, reads and writes to arbitrary memory +locations, as well as function calls that have symbolic parameters. + +Function Pointer Overwrite +-------------------------- + +In the following example, ``f_ptr`` is overwritten by the receive function. So instead of calling ``f_ptr``, the program +ends up calling a pointer set by the attacker. + +.. code-block:: c + :number-lines: + + int main(int argc, char **argv) + { + int (*f_ptr)(void); + char buffer[32]; + f_ptr = f; // f is a function defined elsewhere in the program + receive(buffer, sizeof(buffer) + 4); + return f_ptr(); + } + +When ``f_ptr`` is called, S2E detects the attempt to set ``EIP`` to a symbolic value and tries every available recipe. +This is very similar to the case of return address overwrites, in which the return instruction fetches the symbolic +value stored on the stack and attempts to assign it to the program counter. Here, we have a call (or jump) instruction +that computes the target (e.g., by getting it from a register or from a memory location specified by the operand) before +writing it to the program counter. The recipe plugin catches the write and tries to figure out if there is a recipe that +can force the program counter to go to an interesting address. + +Arbitrary Writes +---------------- + +The code snippet below contains an arbitrary write vulnerability. It exemplifies a situation that commonly occurs with +heap overflow vulnerabilities. An attacker may overwrite the memory location specified by input bytes ``[32:35]`` with +the value specified by input bytes ``[0:3]``. + +.. code-block:: c + :number-lines: + + int main(int argc, char **argv) + { + // Initialize a with the address of a legitimate global variable + int *a = &g_my_var; + char buffer[32]; + + receive(buffer, sizeof(buffer) + 4); + *a = *(int *)buffer; + return *a; + } + +The trick to exploit such vulnerabilities automatically is to collect addresses of all sorts of interesting targets +during execution. Such targets include locations of return addresses on the stack, various code pointers, etc. When S2E +finds an arbitrary write, the recipe plugin uses that write to overwrite every potential target with attacker-controlled +data. Later on, as S2E continues execution, it will detect the use of the overwritten return address and handle it as +the common case of return address / code pointer overwrite. + +The recipe plugin instruments call and ret instructions to keep a LIFO structure for locations of return addresses to be +used as potential targets for arbitrary writes. This is a best effort attempt at exploitation: if the binary interrupts +execution between the arbitrary write vulnerability and the following return instruction (e.g., by means of an exit), +the exploitation attempt will fail. We discuss later ways to identify more potential targets to improve the chances of +successfully exploiting arbitrary writes. + +Arbitrary Reads +--------------- + +S2E also supports exploitation of arbitrary memory reads. 
The following code snippet has a pointer ``a`` to a structure +that contains a function pointer ``f_ptr``. The program dereferences ``a`` and then calls ``f_ptr``. The attacker can +overwrite ``a`` to point to the buffer buffer, which would allow setting ``f_ptr`` to an arbitrary value and thus +execute arbitrary code. + +.. code-block:: c + :number-lines: + + struct test { + int abcd; + int (*f_ptr)(void); + }; + + struct test g_test1 = {0, my_func1}; + int main(int argc, char **argv) + { + // Initialize a with the address of a legitimate global variable + struct test *a = &g_test1; + char buffer[32]; + + // This receive overflows by 4 bytes, overwriting pointer a + // with attacker-controlled data. + receive(buffer, sizeof(buffer) + 4); + + // Reads attacker-controller pointer value from a, + // then reads the address of a function stored in f_ptr + // (also attacker controlled), and finally calls that function. + a->f_ptr(); + return 0; + } + +When S2E identifies an arbitrary read, the recipe plugin looks for memory locations (e.g., ``buffer``) that contain +symbolic data (i.e., derived from user input). The plugin forces constraints on the target of the read operation (e.g., +``a``) to make it point to one of these locations, and let execution go forward. By doing so, if any of the values read +from symbolic memory are used, e.g., as target of a write operation, or of an indirect control instruction, the plugin +can detect and exploit it as explained in previous scenarios. The line invoking the function pointer ``a->f_ptr()`` +triggers the arbitrary read vulnerability. S2E automatically overwrites pointer ``a`` with the address of buffer, so +that ``f_ptr`` tries to invoke symbolic bytes at ``buffer[4:7]``. This is then handled as a function pointer overwrite +case. + +Function Calls with Symbolic Parameters +--------------------------------------- + +There are cases where the ability to pass arbitrary arguments to certain functions can be exploited to exfiltrate data. +The following example transmits 128 bytes stored at the memory location pointed to by ``a``. Unfortunately, this +location can be controlled by the attacker through a buffer overflow. The attacker can therefore set it to any address +and exfiltrate pretty much anything from the address space of the binary, such as encryption keys, passwords, or other +secrets. + +.. code-block:: c + :number-lines: + + char g_long_string[128] = "..."; + int main(int argc, char **argv) + { + // Initialize a to the address of a legitimate string + char *a = g_a_string; + char buffer[32]; + + // Overflow 4 bytes past the buffer + receive(0, buffer, sizeof(buffer) + 4); + + // a contains attacker-controlled data, allowing to exfiltrate + // any data in the address space. + transmit(a, 128); + return 0; + } + +Detecting such cases for S2E is straightforward. The recipe plugin instruments every critical function (e.g., +``transmit()``) to check whether any of the critical parameters can be made to point to interesting data. It then +applies recipes to produce a ``Type`2`` PoV which aims to leak a flag in the secret page. The challenge is to +automatically identify such functions (not only ``transmit()``) inside the CGC binaries. Before starting the analysis of +the binary, S2E disassembles it, extracts all function addresses, then invokes every function with canned parameters. If +the function produces the expected output, identification succeeded. 
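The snippet below is only a rough sketch of that identification idea, with made-up names and a fake output channel;
the real implementation runs inside S2E on the lifted binary and observes the actual ``transmit()`` behavior. It
illustrates the general pattern: call each candidate function with canned arguments and keep the ones whose observable
effect matches the expected one.

.. code-block:: c

    #include <stdio.h>
    #include <string.h>

    /* Fake output channel observed by the harness. It stands in for the real
     * transmit syscall whose effects S2E would observe. */
    static char observed[64];
    static size_t observed_len;

    /* Candidate 1: behaves like transmit(buf, len). */
    static int candidate_a(const void *buf, size_t len)
    {
        if (len > sizeof(observed))
            len = sizeof(observed);
        memcpy(observed, buf, len);
        observed_len = len;
        return 0;
    }

    /* Candidate 2: unrelated helper that ignores its arguments. */
    static int candidate_b(const void *buf, size_t len)
    {
        (void)buf;
        (void)len;
        return 0;
    }

    typedef int (*candidate_fn)(const void *, size_t);

    /* Invoke a candidate with canned parameters and check for transmit-like behavior. */
    static int looks_like_transmit(candidate_fn f)
    {
        static const char probe[] = "CANNED-PROBE";
        observed_len = 0;
        f(probe, sizeof(probe));
        return observed_len == sizeof(probe) && memcmp(observed, probe, sizeof(probe)) == 0;
    }

    int main(void)
    {
        printf("candidate_a looks like transmit: %d\n", looks_like_transmit(candidate_a)); /* prints 1 */
        printf("candidate_b looks like transmit: %d\n", looks_like_transmit(candidate_b)); /* prints 0 */
        return 0;
    }
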
+ + + + S2E uses `RevGen `__, an x86-to-LLVM translator, + in order to extract function types from the binary before analyzing it. + + +Generating Replayable PoVs +========================== + +In the CGC framework, a PoV is a normal program that communicates with the vulnerable binary in order to exploit it. +Communication can be done through a pipe or a network. A PoV can send data to the binary and receive data that the +binary outputs. PoVs are free to make computations on the data they get from the challenge binary in order to generate +input for the binary that will lead to exploitation. + +The example below shows a simple PoV that sends a long string that triggers a buffer overflow in the challenge binary. +Note that even if data sent by the binary is not used, it must still be consumed by the PoV, otherwise the binary could +block when its transmit buffer is full. + +.. raw:: html + + + + + + + +
PoVChallenge Binary
+ +.. code-block:: c + :number-lines: + + int main(...) { + transmit( + "aaaaaaaaaaaaaaaa" + "bbbbbbbbbbbbbbbb" + "cccccccc", 40 + ); + + char buffer[4]; + receive(buffer, 4); + } + +.. raw:: html + + + +.. code-block:: c + :number-lines: + + int main(...) { + char buffer[32]; + + receive(buffer, sizeof(buffer) + 8); + + transmit("done", 4); + } + + +.. raw:: html + +
+ +In this section, we will discuss some of the challenges that symbolic execution engines face in order to generate +correct and reliable PoVs. + +S2E generates a PoV for the above example as follows. First, S2E instruments the program to monitor calls to the receive +and transmit functions. S2E makes the contents of the receive buffer symbolic and records what the binary writes through +the transmit function. It maintains an ordered list of these calls. When a path terminates and is exploitable, S2E +generates concrete inputs and attaches them to the corresponding receive entry in the list. Second, for every receive +invoked by the binary, S2E generates a corresponding write in the PoV. This write contains the concrete data computed by +the solver. Likewise, S2E generates a receive operation for every transmit done by the binary. In its simplest form, the +PoV ignores the contents sent by the binary. + +The complexity of generating a PoV depends on whether the challenge binary is deterministic or not, and whether it +requires the PoV to perform computations. A deterministic binary is one that does not use randomness, making PoV +generation easy. When the symbolic execution engine detects a vulnerable point, it calls the constraint solver in order +to get concrete inputs. These concrete inputs can then be used to exploit the binary. Moreover, they are guaranteed to +work on every exploitation attempt. + +Generating PoVs for non-deterministic input is much harder than for deterministic ones. Non-determinism occurs when the +challenge binary relies on random data to implement its functionality. This often happens in challenge-response +algorithms, where a program sends a random value to the client and expects the client to reply with a correct response +based on some computation on that random value. A simplified version of this is when a program generates a random value +(or "cookie"), sends that cookie to the client, then expects the client to send that cookie back unmodified on the next +request in order to operate properly. + +S2E handles non-deterministic binaries that use simple cookies. Consider the following scenario. A challenge binary +calls the random number generator, records the random number, then transmits it. It then expects to receive that number +from the remote host in order to continue with execution. S2E has no trouble making the random value symbolic and +getting to the vulnerability. The problem is that by default, the generated PoV is invalid: the constraint solver does +not know that the received value has any connection to the written value and as a result generates a bogus concrete +value that does not match the random data. Moreover, the random value will be different on every run, so it is +impossible to hard-code a fixed value in the PoV. The following code snippet shows such a case. + +.. 
code-block:: c + :number-lines: + + int main(int argc, char **argv) + { + int data; + // S2E returns a symbolic value instead of the original concrete value + int cookie = random(); + // The binary sends the random value to the client + transmit(&cookie, sizeof(cookie)); + // S2E creates a fresh symbolic value for data + receive(&data, sizeof(data)); + // data is not constrained, so S2E can explore both outcomes of the if + if (data != cookie) { + return 0; + } + // When arriving here, S2E generates an unreplayable PoV because + // it did not realize that data and cookie are connected together + vulnerable_code(); + } + + void naive_pov() + { + int data0; + // The POV expects to read 4 bytes of data written by the program. + receive(&data, sizeof(data)); + // 1234 is a random value chosen by the solver for the cookie. It was + // valid only for one path and is unlikely to be useful in the next run. + int data1 = 1234; + transmit(&data1, sizeof(data1)); + } + + +A correct PoV would look like this: + +.. code-block:: c + :number-lines: + + void correct_pov() + { + int data0; + receive(&data0, sizeof(data0)); + int data1 = data0; + // Transmit the previously received data + transmit(&data1, sizeof(data1)); + } + +To generate a correct PoV, S2E looks at all branch conditions and looks for cases where the content of a receive buffer +is compared with a symbolic value derived from the random number generator. Once it found such a comparison, it can +easily generate the correct PoV code by mapping the symbolic value created in the receive call to the symbolic value +written by the transmit function. + +A much harder case happens when the PoV needs to perform computations. Consider the slightly modified above example: + +.. code-block:: c + :number-lines: + + int main(int argc, char **argv) + { + int data; + int cookie = random(); + transmit(&cookie, sizeof(cookie)); + receive(&data, sizeof(data)); + if (data * 8 != cookie) { + return 0; + } + vulnerable_code(); + } + +A valid PoV would look something like this: + +.. code-block:: c + :number-lines: + + void correct_pov() + { + int data0; + receive(&data0, sizeof(data0)); + int data1 = data0 / 8; + // Transmit the previously received data + transmit(&data1, sizeof(data1)); + } + +Generating a valid PoV in the general case where computations on transmitted data are involved requires embedding a +constraint solver directly inside the PoV itself. The PoV would have to solve the equation (``date * 8 == cookie``) in +order to exploit the binary. For some simple cases like here, it may be possible to invert the equation, though in +general, conditions are of the form ``f(x,y,...)=0``, making this task practically impossible without running the actual +solver. The following snippet shows how would an automatically generated PoV with an embedded solver look like. + +.. code-block:: c + :number-lines: + + void correct_pov_with_solver() + { + int data0, data1; + receive(&data0, sizeof(data0)); + // pseudo code that takes data0 as input and computes data1 + solve("%s * 8 == %s", &data0, &data1); + // Transmit the previously received data + transmit(&data1, sizeof(data1)); + } + +Unfortunately, we ran out of time and didn't have time to implement this solution. The main challenge was to fit an +entire solver within the size and memory limits of a PoV, as well as modifying the solver to accommodate a very +restricted runtime environment, that has primitive memory allocation, no standard library, etc. 
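To give a feel for what "primitive memory allocation, no standard library" means here: a Decree PoV only has the seven
Decree system calls to work with (``_terminate``, ``transmit``, ``receive``, ``fdwait``, ``allocate``, ``deallocate``
and ``random``), so an embedded solver would have to bring its own ``malloc``, built either on ``allocate`` or on a
static arena linked into the PoV. The sketch below shows the latter; it is purely illustrative of the constraint and
is not code that was ever written for the CRS.

.. code-block:: c

    #include <stddef.h>

    /* A PoV has no malloc(); a static-arena bump allocator like this is about
     * all an embedded solver could rely on within the PoV's memory budget. */
    static char arena[1 << 20]; /* 1 MiB, part of the PoV binary itself */
    static size_t arena_off;

    void *pov_malloc(size_t n)
    {
        n = (n + 7) & ~(size_t)7; /* keep 8-byte alignment */
        if (arena_off + n > sizeof(arena))
            return NULL; /* no fallback: the PoV must fit in its budget */
        void *p = arena + arena_off;
        arena_off += n;
        return p;
    }

    int main(void)
    {
        /* Tiny smoke test: carve two blocks out of the arena. */
        char *a = pov_malloc(100);
        char *b = pov_malloc(200);
        return (a != NULL && b != NULL) ? 0 : 1;
    }
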
+ +Conclusion +========== + +In this tutorial, you have learnt the theory behind automated PoV generation as well as various practical +issues that arise when building a robust PoV generator. Now it is a good time to get your hands dirty +by actually `generating `__ PoVs for a few vulnerable binaries. diff --git a/docs/src/Tutorials/Revgen/Revgen.rst b/docs/src/Tutorials/Revgen/Revgen.rst new file mode 100644 index 00000000..d34ddf33 --- /dev/null +++ b/docs/src/Tutorials/Revgen/Revgen.rst @@ -0,0 +1,436 @@ +======================================== +Translating binaries to LLVM with Revgen +======================================== + +Revgen is a tool that takes a binary and turns ("lifts") it into LLVM bitcode. It does so in four steps: + +* Disassemble the binary using IDA Pro +* Recover the control flow graph (CFG) using `McSema `__ +* Translate each basic block in the CFG into a chunk of LLVM bitcode by using QEMU's translator +* Stitch together translated basic blocks into LLVM functions + +Revgen relies on QEMU's translator to extract the semantics of machine instructions into a simple intermediate +representation (IR) and S2E's LLVM translator to turn this IR into LLVM bitcode. This brings two main advantages over +competing approaches: + +* **Better precision.** QEMU developers spent 15+ years building translators that are precise enough to + emulate actual operating systems. Even a seemingly simple x86 instruction such as ``call`` needs more than ten pages + of pseudo code in the Intel instruction set manual. Revgen effortlessly captures the complex behavior of all + machine instructions, including system instructions not normally used in user programs. + +* **Better reliability.** Implementing a translator from scratch brings bugs and incompleteness. Revgen, however, + can handle pretty much every instruction that QEMU supports, even the most exotic ones. + +In the rest of this document, we will show how to use Revgen, how it works under the hood, and give a brief overview of +its performance. As we will see, the main drawback of relying on QEMU's translator is the high overhead of the generated +code (15-30x larger binaries). Revgen would need to reuse various optimization passes in order to bring this +overhead down. + + +Using Revgen +============ + +In this section, we show how to use Revgen on DARPA CGC binaries. We cleaned up and made stable our initial `prototype +`__ of Revgen in order to analyze CGC binaries, so it supports these binaries the +best. Other types of binaries will be supported in the future. + + +1. Prerequisites +---------------- + +Before starting, make sure that you have a functional S2E environment and that you have a working IDA Pro setup. Check +that: + + * IDA can disassemble CGC binaries. You will need to install the CGC plugins, which you can find + `here `__. If you cannot find pre-compiled plugins for your IDA version, you will + need to compile the plugin yourself using the IDA Pro SDK. + + * IDA has a working Python environment with the ``protobuf`` library installed. + Running ``pip install protobuf`` should work in most cases. + + * The ``IDA_ROOT`` variable is set to the IDA Pro path, e.g., ``export IDA_ROOT=/opt/ida-6.8``. + +Revgen uses `McSema `__ to recover +the CFG of the binary, and that requires IDA Pro. Note that this is an older version of McSema scripts +(from 2016). `McSema2 `__ has been released in the meantime, which should have +much better CFG recovery. We plan to port Revgen to the latest version. + + +2. 
Build the CGC binaries +------------------------- + +S2E comes with a Docker image and `instructions `__ on how to build +all DARPA CyberGrandChallenge binaries. After you have completed this step, you should have a ``samples`` folder that +contains ~280 binaries: + +.. code-block:: bash + + ls -1 $S2E_ENV/source/s2e/decree/samples + CADET_00001 + CADET_00003 + CROMU_00001 + CROMU_00002 + CROMU_00003 + CROMU_00004 + CROMU_00005 + CROMU_00006 + ... + + +3. Translating a binary +----------------------- + +Let us first translate the ``CADET_00001`` binary. For this, set the ``S2E_PREFIX`` and ``IDA_ROOT`` variables, +then run ``revgen.sh``. This tutorial assumes that your S2E environment is in ``$S2E_ENV``. + +.. code-block:: bash + + export IDA_ROOT=/opt/ida-6.8 + export S2E_PREFIX=$S2E_ENV/install + $S2E_PREFIX/bin/revgen.sh $S2E_ENV/source/s2e/decree/samples/CADET_00001 + +You should get the following console output: + +.. code-block:: console + + stat: cannot stat '$S2E_ENV/source/s2e/decree/samples/CADET_00001.pbcfg': No such file or directory + [IDA ] Writing CFG to $S2E_ENV/source/s2e/decree/samples/CADET_00001.pbcfg... + + [REVGEN ] Translating $S2E_ENV/source/s2e/decree/samples/CADET_00001 to $S2E_ENV/source/s2e/decree/samples/CADET_00001.bc... + warning: Linking two modules of different data layouts: '$S2E_ENV/install/lib/X86BitcodeLibrary.bc' is 'e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128' whereas 'tcg-llvm' is 'e-m:e-i64:64-f80:128-n8:16:32:64-S128' + + [4] RevGen:generateFunctionCall - >Function 0x804860c does not exist + [LLVMDIS] Generating LLVM disassembly to $S2E_ENV/source/s2e/decree/samples/CADET_00001.ll... + [CLANG ] Compiling LLVM bitcode of CGC binary to native binary $S2E_ENV/source/s2e/decree/samples/CADET_00001.rev... + +You will find the following output files in the ``$S2E_ENV/source/s2e/decree/samples`` folder. +The meaning of each file is explained in the ``revgen.sh`` script, but here is an explanation of the most important +ones: + + * ``CADET_00001``: the original binary + * ``CADET_00001.pbcfg``: the CFG extracted by IDA Pro / McSema + * ``CADET_00001.bc``: the LLVM bitcode file created by RevGen + * ``CADET_00001.rev``: the LLVM bitcode file compiled to an ELF binary that you can run on your Linux host + + +4. Running a translated CGC binary +---------------------------------- + +Revgen comes with a runtime library that translates Decree system calls to their Linux counterparts. This allows +you to run the translated Decree binaries on your Linux host. For example, running ``CADET_00001`` as follows: + +.. code-block:: console + + user@ubuntu:~$ $S2E_ENV/source/s2e/decree/samples/CADET_00001.rev + + Welcome to Palindrome Finder + + Please enter a possible palindrome: sdf + Nope, that's not a palindrome + + Please enter a possible palindrome: aaa + Yes, that's a palindrome! + + Please enter a possible palindrome: + +.. warning:: + + Revgen currently supports only CGC binaries. It may or may not be able to generate a bitcode file for other + kinds of binaries (e.g., Linux or Windows) and cannot run non-CGC binaries. + + Some CGC binaries may fail to translate because of various limitations of the (old) McSema script that Revgen uses. + + +Design and implementation +========================= + +Revgen's design is straightforward: it takes a list of basic blocks, calls a translator to turn them to equivalent +pieces of LLVM bitcode, then stitches these pieces of bitcode together in order to reconstruct original functions. 
+ +At a high level, the translator takes a block of machine code (e.g., x86) and turns it into a QEMU-specific intermediate +representation (IR). The translator then transforms this IR to the desired target instruction set (in Revgen's case, +LLVM). The translator is composed of the `CPU emulation library (libcpu) `__, which +generates the IR, and of the `Tiny Code Generator library (libtcg) `__, which handles the +IR to LLVM conversion. We extracted ``libcpu`` and ``libtcg`` from QEMU and made both available as standalone libraries. +We added LLVM translation capabilities to ``libtcg``, which you can find `here +`__. + +In the rest of this section, we will explain in more details how the translator works and how Revgen uses it to build an +LLVM version of an entire binary. We will also see what it takes to run such binaries and discuss the assumptions that +Revgen makes about them. + + +Translating basic blocks to LLVM +-------------------------------- + +Revgen takes the binary file and the CFG recovered my McSema, and turns every basic block in that CFG into a piece of +LLVM code. Revgen stops when it has translated all basic blocks in the CFG. The result is a set of independent LLVM +functions, one for each basic block. Revgen's translator handles basic blocks in two steps: (1) it turns a basic block +into a sequence of micro-operations and then (2) converts them to LLVM instructions. We will see next this process in +more details. + +First, the translator converts machine instructions into an equivalent sequence of micro-operations. For example, the +translator decomposes the x86 instruction ``inc [eax]`` into a load to a temporary register, an increment of that +register, and a memory store. This implements the effects of incrementing the memory location stored in the ``eax`` +register. The resulting sequence of micro-operations forms a *translation block*. + +Second, the translator maps each micro-operation to LLVM instructions, using a code dictionary. The dictionary +associates each micro-operation with a sequence of LLVM instructions that implement the operation. Most conversions are +one-to-one mappings between micro-operations and LLVM instructions (e.g., arithmetic, shift, load/store +operations). + +The translator also handles instructions that manipulate system state. Revgen accurately translates to LLVM +instructions like ``fsave`` or ``mov cr0, eax``. The former saves the state of the floating point unit, while the latter +sets the control register (e.g., to enable 32-bit protected mode, which changes the behavior of many instructions). + +For this, the translator uses *emulation helpers*. An emulation helper is a piece of C code that emulates complex +machine instructions that do not have equivalent micro-operations. Revgen compiles emulation helpers to LLVM and adds +them to the code dictionary, transparently enabling the support of machine instructions that manipulate system state. +Helpers are implemented in ``libcpu`` and you can find them `here +`__. + +Third, the translator packages the sequence of LLVM instructions into an LLVM function that is *equivalent* to the +original basic block taken from the binary. More precisely, given the same register and memory input, the translated +code produces the same output as what the original binary does if executed on a real processor. + +To illustrate this process, let us consider the following function. This function invokes the exit system call +with a status code passed as a parameter on the stack. 
The function is composed of two basic blocks: one starting +at address ``0x804860C`` and another one at ``0x8048618``. + + +.. code-block:: asm + + .text:0804860C ; int __cdecl sub_804860C(int status) + .text:0804860C sub_804860C proc near + .text:0804860C + .text:0804860C + .text:0804860C status = dword ptr 4 + .text:0804860C + .text:0804860C mov eax, 1 + .text:08048611 push ebx + .text:08048612 mov ebx, [esp+4+status] ; status + .text:08048616 int 80h ; LINUX - sys_exit + .text:08048616 sub_804860C endp + .text:08048616 + .text:08048618 ; --------------------------------------------------------------------------- + .text:08048618 pop ebx + .text:08048619 retn + + +Revgen turns these two blocks into two LLVM functions that look like this: + +.. code-block:: llvm + + define i64 @tcg-llvm-tb-804860c-c-a3-0-4000b7(%struct.CPUX86State* nocapture) local_unnamed_addr #17 { + %2 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 5 + + ; mov eax, 1 + %3 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 0, i64 0 + store i32 1, i32* %3, align 4 + + ; push ebx + %4 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 0, i64 3 + %ebx = load i32, i32* %4, align 4, !s2e.pc !377 + %5 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 0, i64 4 + %esp = load i32, i32* %5, align 4, !s2e.pc !377 + %6 = add i32 %esp, -4, !s2e.pc !378 + tail call void @__stl_mmu(i32 %6, i32 %ebx, i32 1), !s2e.pc !377 + + ; mov ebx, [esp+4+status] + store i32 %6, i32* %5, align 4 + %7 = add i32 %esp, 4, !s2e.pc !378 + %8 = tail call i32 @__ldl_mmu(i32 %7, i32 1), !s2e.pc !378 + store i32 %8, i32* %4, align 4 + + ; int 0x80 + store i32 134514198, i32* %2, align 4 + tail call void @helper_raise_interrupt(i32 128, i32 2) + ret i64 0 + } + + define i64 @tcg-llvm-tb-8048618-2-99-0-4000b7(%struct.CPUX86State* nocapture) local_unnamed_addr #17 { + ; pop ebx + %2 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 5 + %3 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 0, i64 4 + %esp = load i32, i32* %3, align 4, !s2e.pc !379 + %4 = tail call i32 @__ldl_mmu(i32 %esp, i32 1), !s2e.pc !379 + %5 = add i32 %esp, 4, !s2e.pc !380 + store i32 %5, i32* %3, align 4 + + ; retn + %6 = getelementptr %struct.CPUX86State, %struct.CPUX86State* %0, i64 0, i32 0, i64 3 + store i32 %4, i32* %6, align 4 + %7 = tail call i32 @__ldl_mmu(i32 %5, i32 1), !s2e.pc !380 + %8 = add i32 %esp, 8, !s2e.pc !380 + store i32 %8, i32* %3, align 4 + store i32 %7, i32* %2, align 4 + ret i64 0 + } + +Each function takes a pointer to a ``CPUX86State`` structure. This structure models the CPU's register file. All machine +instructions are translated into LLVM instructions that operate on this CPU state structure. +To handle memory accesses, the translator emits calls to ``__stX_mmu`` and ``__ldX_mmu`` helpers. We will explain later +why the translator generates these instead of native LLVM load/store instructions. The ``int 0x80`` instruction is +complex and the translator calls the ``helper_raise_interrupt`` function to handle it. + + +Stitching basic blocks into functions +------------------------------------- + +Now that Revgen created a set of LLVM functions that represent individual basic blocks of the binary, +it needs to assemble them into a bigger function that represents the original function of the binary. +This is straightforward: Revgen creates a new LLVM function and fills it with calls to the translated basic blocks. 
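+
+To make this step concrete, the following self-contained C sketch models the idea on a made-up two-block function.
+The names and the reduced CPU-state structure are invented for illustration; Revgen itself emits LLVM bitcode that
+operates on ``CPUX86State``, as shown above.
+
+.. code-block:: c
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    /* Toy stand-in for CPUX86State, reduced to two registers. */
+    typedef struct { uint32_t eax, ebx; } ToyCpuState;
+
+    /* Two "translated basic blocks" of a made-up guest function: each block is
+     * an independent function that updates the CPU state. */
+    static void block_0x1000(ToyCpuState *s) { s->eax = 1; }        /* mov eax, 1   */
+    static void block_0x1005(ToyCpuState *s) { s->eax += s->ebx; }  /* add eax, ebx */
+
+    /* "Stitching": the reconstructed function simply calls its blocks in order. */
+    static void reconstructed_fn(ToyCpuState *s)
+    {
+        block_0x1000(s);
+        block_0x1005(s);
+    }
+
+    int main(void)
+    {
+        ToyCpuState s = { .eax = 0, .ebx = 41 };
+        reconstructed_fn(&s);
+        printf("eax = %u\n", (unsigned)s.eax); /* prints: eax = 42 */
+        return 0;
+    }
+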
+So our example above would look like this: + + +.. code-block:: llvm + + define i64 @__revgen_sub_804860c_804860c() local_unnamed_addr #0 { + %1 = getelementptr %struct.CPUX86State, %struct.CPUX86State* @myenv, i64 0 + br label %2 + + ;