diff --git a/.gitignore b/.gitignore
index 9c07d4a..4f31ac9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,17 @@
 *.class
 *.log
+# scala ignores
+# Simple Build Tool
+# http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
+dist/*
+target/
+lib_managed/
+src_managed/
+project/boot/
+project/plugins/project/
+.history
+.cache
+.lib/
+# IDE ignores
+.bsp/
+.idea/
diff --git a/README.md b/README.md
index 4289bb8..dc53e3a 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,122 @@
 # joernCpgExport
-Export ShiftLeft Code Property Graph (cpg) from OverflowDB to json and csv
+Export ShiftLeft Code Property Graph (cpg) from OverflowDB to json and csv.
+The csv can be imported into the Neo4j graph database.
+Neo4j can be used to visualize the graph and to write queries via cypher.
+
+**Please note:** The OverflowDB file format is not compatible between different versions.
+Be sure to use the same OverflowDB version in joern and joernCpgExport.
+
+References:
+* https://github.com/ShiftLeftSecurity/codepropertygraph
+* https://github.com/ShiftLeftSecurity/joern/
+* https://github.com/ShiftLeftSecurity/overflowdb
+* https://neo4j.com/
+
+## Build
+```shell
+$ sbt stage
+```
+
+## Usage
+```shell
+$ ./target/universal/stage/bin/joernCpgExport --help
+Usage: joernCpgExport [options]
+
+  -d, --no-default-overlays  do not apply default overlays
+  -o, --no-oss-dataflow      do not apply oss dataflow overlay
+  -c, --cpg <file>           load cpg from OverflowDB
+  -j, --json <file>          export cpg as json file
+  --help                     prints this usage text
+```
+
+## Example
+```shell
+# use fuzzyc2cpg from joern to parse the source code and create the OverflowDB file
+$ fuzzyc2cpg.sh --output cpg.bin src
+# convert the OverflowDB file into json
+$ joernCpgExport --cpg cpg.bin --json cpg.json
+```
+
+cpg.json:
+```json
+{
+  "nodes" : [ {
+    "ID" : 1,
+    "OVERLAYS" : [ "semanticcpg", "dataflowOss" ],
+    "VERSION" : "0.1",
+    "LANGUAGE" : "C",
+    "TYPE" : "META_DATA"
+  }, {
+    "ID" : 2,
+    "FULL_NAME" : "",
+    "ORDER" : 1,
+    "TYPE" : "NAMESPACE_BLOCK",
+    "NAME" : "",
+    "FILENAME" : ""
+  }, {
+    "ID" : 100,
+    "TYPE_DECL_FULL_NAME" : "",
+    "NAME" : "",
+    "FULL_NAME" : "",
+    "TYPE" : "TYPE"
+  }, {
+    "ID" : 101,
+    "TYPE_DECL_FULL_NAME" : " [ 1 ]",
+    "NAME" : " [ 1 ]",
+    "FULL_NAME" : " [ 1 ]",
+    "TYPE" : "TYPE"
+  },
+  [...]
+  ]
+}
+```
+
+## Convert json to csv
+You can convert the json file into csv files, grouped by node type.
+The csv files can be imported into Neo4j via bulk import.
+The csv files are created in the `csv` subfolder of the directory containing the provided `cpg.json`.
+Existing files will be overwritten.
+```shell
+# pandas is required as a dependency
+$ pip3 install --user pandas
+$ scripts/joern-json-csv.py cpg.json
+```
+csv/edge.csv:
+```csv
+:START_ID,:END_ID,:TYPE,VARIABLE
+1313,1024809,REF,
+1314,1154373,REF,
+1315,1024810,REF,
+1316,1154374,REF,
+1317,1030528,REF,
+1318,1154375,REF,
+1319,1024811,REF,
+1320,1154376,REF,
+1321,1154377,REF,
+1322,1154378,REF,
+1323,1154379,REF,
+```
+
+## Import into Neo4j
+Neo4j offers two methods to import csv data: (1) via the cypher `LOAD CSV` command and (2) via the `neo4j-admin` command.
+For (1) you first need to create the graph model. Data types of the csv columns must be explicitly converted during loading.
+For (2) the database must be empty (deleting all nodes and relationships does not seem to be sufficient). Data types are detected automatically or can be declared via the column header.
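+
+A minimal sketch of method (1) for the relationships, assuming `csv/edge.csv` was copied into Neo4j's `import` directory, the nodes were already loaded with an `id` property, and using a hard-coded relationship type (plain cypher cannot take the type from a column):
+```cypher
+LOAD CSV WITH HEADERS FROM 'file:///edge.csv' AS row
+MATCH (src {id: toInteger(row.`:START_ID`)})
+MATCH (dst {id: toInteger(row.`:END_ID`)})
+CREATE (src)-[:REF]->(dst);
+```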
+
+See also: https://neo4j.com/developer/guide-import-csv/
+
+Neo4j commands:
+```shell
+# stop Neo4j service
+$ neo4j stop
+# delete database
+$ rm -rf data
+# set Neo4j admin password
+$ neo4j-admin set-initial-password <password>
+# batch import
+$ neo4j-admin import --multiline-fields=true --quote=\" --relationships=csv/edge.csv --nodes=csv/vertex_BLOCK.csv --nodes=csv/vertex_CALL.csv ...
+# start Neo4j service
+$ neo4j start
+```
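+
+After the import, the graph can be explored via cypher, e.g. a quick sanity check counting nodes per label:
+```cypher
+MATCH (n) RETURN labels(n) AS label, count(*) AS cnt ORDER BY cnt DESC;
+```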
+
+Cpg viewed in the Neo4j web UI:
+
+![graph](res/graph.png)
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
new file mode 100644
index 0000000..c453c17
--- /dev/null
+++ b/build.sbt
@@ -0,0 +1,95 @@
+scalaVersion := "2.13.3"
+
+name := "joernCpgExport"
+organization := "de.peckto"
+version := "1.0"
+
+libraryDependencies += "org.scala-lang.modules" %% "scala-parser-combinators" % "1.1.2"
+
+// ShiftLeft code property graph libraries; use the same version as the joern
+// installation that produced the OverflowDB file
+val cpgVersion = "1.3.121"
+
+libraryDependencies += "ch.qos.logback" % "logback-classic" % "1.2.3" % Runtime
+
+libraryDependencies += "io.shiftleft" % "overflowdb-tinkerpop3" % "1.11"
+
+libraryDependencies += "io.shiftleft" %% "codepropertygraph" % cpgVersion
+libraryDependencies += "io.shiftleft" %% "semanticcpg" % cpgVersion
+libraryDependencies += "io.shiftleft" %% "dataflowengineoss" % cpgVersion
+
+// jackson-databind must match the minor version of jackson-module-scala
+libraryDependencies += "com.fasterxml.jackson.core" % "jackson-databind" % "2.12.2"
+libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.12.2"
+libraryDependencies += "com.github.scopt" %% "scopt" % "4.0.0"
+
+// provides the `stage` task used in the build instructions
+enablePlugins(JavaAppPackaging)
+
+scalacOptions := List("-encoding", "utf8", "-Xfatal-warnings", "-deprecation", "-unchecked")
diff --git a/project/build.properties b/project/build.properties
new file mode 100644
index 0000000..dbae93b
--- /dev/null
+++ b/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.4.9
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100644
index 0000000..550788d
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1 @@
+addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.8.1")
\ No newline at end of file
diff --git a/res/graph.png b/res/graph.png
new file mode 100644
index 0000000..b18c2ed
Binary files /dev/null and b/res/graph.png differ
diff --git a/scripts/joern-json-csv.py b/scripts/joern-json-csv.py
new file mode 100755
index 0000000..f52f24b
--- /dev/null
+++ b/scripts/joern-json-csv.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import sys
+import os
+import json
+import pandas as pd
+
+
+def joern_json_csv(path):
+    base = os.path.dirname(path)
+    csv_dir = os.path.join(base, 'csv')
+    if not os.path.exists(csv_dir):
+        os.mkdir(csv_dir)
+
+    with open(path) as fh:
+        j = json.load(fh)
+    vertex = pd.DataFrame(j['nodes'])
+    edge = pd.DataFrame(j['edges'])
+    # collect the generated file names, grouped as expected by neo4j-admin import
+    d = {'nodes': []}
+    for label in vertex['TYPE'].unique():
+        df = vertex[vertex['TYPE'] == label]
+        df = df.dropna(how='all', axis=1)
+        # the node id becomes the Neo4j import id, the cpg type the node label
+        df = df.rename(columns={'ID': f'{label}:ID', 'TYPE': ':LABEL'})
+        f = os.path.join(csv_dir, f'vertex_{label}.csv')
+        d['nodes'].append(f)
+        df.to_csv(f, index=False)
+
+    edge = edge.dropna(how='all', axis=1)
+    # outV holds the source node id, inV the target node id
+    edge = edge.rename(
+        columns={'outV': ':START_ID', 'inV': ':END_ID', 'TYPE': ':TYPE'})
+    f = os.path.join(csv_dir, 'edge.csv')
+    d['relationships'] = f
+    edge.to_csv(f, index=False)
+
+    return d
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        sys.exit(f'usage: {sys.argv[0]} cpg.json')
+    joern_json_csv(sys.argv[1])
diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala
new file mode 100644
index 0000000..a44dad1
--- /dev/null
+++ b/src/main/scala/Main.scala
@@ -0,0 +1,181 @@
+import java.io.StringWriter
+import java.io.PrintWriter
+import java.io.File
+import scala.jdk.CollectionConverters._
+import com.fasterxml.jackson.databind.json.JsonMapper
+import com.fasterxml.jackson.databind.DeserializationFeature
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import overflowdb.{Config, Edge, Node}
+import io.shiftleft.codepropertygraph.cpgloading.{CpgLoader, CpgLoaderConfig}
+import io.shiftleft.codepropertygraph.Cpg
+import io.shiftleft.semanticcpg.layers.{LayerCreator, LayerCreatorContext, Scpg}
+import io.shiftleft.dataflowengineoss.layers.dataflows.OssDataFlow
+import io.shiftleft.dataflowengineoss.layers.dataflows.OssDataFlowOptions
+import org.slf4j.{Logger, LoggerFactory}
+import scopt.OParser
+import overflowdb.traversal._
+
+
+case class Args(
+    loadDefaultOverlays: Boolean = true,
+    loadOssDataflowOverlay: Boolean = true,
+    loadCpg: File = new File("."),
+    json: File = new File(".")
+)
+
+
+object Main {
+  private val logger: Logger = LoggerFactory.getLogger(Main.getClass)
+
+  def convertNode(node: Node): Map[String, Object] = {
+    // merge the node label and id into the property map
+    node.propertyMap.asScala.addAll(List(
+      ("TYPE", node.label),
+      ("ID", node.id().asInstanceOf[Object])
+    )).toMap
+  }
+
+  def convertEdge(edge: Edge): Map[String, Object] = {
+    // outV is the source (out) node of the edge, inV the target (in) node
+    edge.propertyMap.asScala.addAll(List(
+      ("TYPE", edge.label),
+      ("outV", edge.outNode.id()),
+      ("inV", edge.inNode.id())
+    )).toMap
+  }
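+
+  // resulting JSON document (sketch):
+  //   { "nodes": [ { "ID": ..., "TYPE": ..., <node properties> }, ... ],
+  //     "edges": [ { "outV": <source id>, "inV": <target id>, "TYPE": ..., <edge properties> }, ... ] }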
+
+  // source: io.shiftleft.codepropertygraph:console/src/main/scala/io/shiftleft/console/Console.scala
+  def applyDefaultOverlays(cpg: Cpg): Cpg = {
+    val appliedOverlays = io.shiftleft.semanticcpg.Overlays.appliedOverlays(cpg)
+    if (appliedOverlays.isEmpty && !(new Scpg().probe(cpg))) {
+      logger.info("Adding default overlays...")
+      val overlayCreators = List(new Scpg)
+      _runAnalyzer(cpg, overlayCreators: _*)
+    }
+    cpg
+  }
+
+  // source: io.shiftleft.codepropertygraph:console/src/main/scala/io/shiftleft/console/Console.scala
+  def _runAnalyzer(cpg: Cpg, overlayCreators: LayerCreator*): Cpg = {
+    overlayCreators.foreach { creator =>
+      runCreator(cpg, creator)
+    }
+    cpg
+  }
+
+  // source: io.shiftleft.codepropertygraph:console/src/main/scala/io/shiftleft/console/Console.scala
+  protected def runCreator(cpg: Cpg, creator: LayerCreator): Unit = {
+    logger.info(s"Applying overlay: ${creator.overlayName}")
+    val context = new LayerCreatorContext(cpg)
+    creator.run(context)
+  }
+
+  def loadCpg(cpgFilename: File): Cpg = {
+    // source: io.shiftleft.codepropertygraph:console/src/main/scala/io/shiftleft/console/Console.scala
+    val odbConfig = Config.withDefaults.withStorageLocation(cpgFilename.toString)
+    val config = CpgLoaderConfig.withDefaults.doNotCreateIndexesOnLoad.withOverflowConfig(odbConfig)
+    val cpg = CpgLoader.loadFromOverflowDb(config)
+    CpgLoader.createIndexes(cpg)
+    cpg
+  }
+
+  def runOssDataflow(cpg: Cpg): Unit = {
+    logger.info("Applying oss dataflow overlay...")
+    val context = new LayerCreatorContext(cpg)
+    val opts = new OssDataFlowOptions()
+    new OssDataFlow(opts).run(context)
+  }
+
+  def exportCpg(configArgs: Args): Unit = {
+    val cpgFilename = configArgs.loadCpg
+    logger.info(s"Load cpg from file: ${cpgFilename}")
+    val cpg = loadCpg(cpgFilename)
+
+    if (cpg.graph.E.asScala.hasLabel("AST").count.next() == 0) {
+      logger.error("Loaded graph does not contain any AST edges; exiting")
+      sys.exit(1)
+    }
+
+    // remember which overlays are already present in the loaded graph
+    var hasDefaultOverlay = cpg.graph.E.asScala.hasLabel("CDG").count.next() != 0
+    val hasOssDataflow = cpg.graph.E.asScala.hasLabel("REACHING_DEF").count.next() != 0
+
+    if (configArgs.loadDefaultOverlays) {
+      if (!hasDefaultOverlay) {
+        applyDefaultOverlays(cpg)
+        // creates edges: CDG, ALIAS_OF, CONTAINS, DOMINATE, EVAL_TYPE, SOURCEFILE, POST_DOMINATE, PARAMETER_LINK
+        hasDefaultOverlay = true
+      } else {
+        logger.info("Graph seems to have default overlays already applied; nothing to do")
+      }
+    } else {
+      logger.info("Default overlays will not be applied")
+    }
+
+    if (configArgs.loadOssDataflowOverlay) {
+      if (!hasOssDataflow) {
+        if (hasDefaultOverlay) {
+          runOssDataflow(cpg)
+          // creates edges: REACHING_DEF
+        } else {
+          logger.warn("oss dataflow overlay depends on the default overlays; skipping oss dataflow overlay")
+        }
+      } else {
+        logger.info("Graph seems to have the oss dataflow overlay already applied; nothing to do")
+      }
+    } else {
+      logger.info("oss dataflow overlay will not be applied")
+    }
+
+    // use gremlin to iterate over all nodes and edges
+    logger.info(s"Export graph to json file: ${configArgs.json.toString}")
+    val g = cpg.graph
+    val nodes = g.V.asScala.toList.map(convertNode)
+    val edges = g.E.asScala.toList.map(convertEdge)
+    val m = Map("nodes" -> nodes, "edges" -> edges)
+
+    val mapper = JsonMapper.builder()
+      .addModule(DefaultScalaModule)
+      .build()
+    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
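+
+    // NOTE: buffering the whole document in memory keeps the code simple; for
+    // very large graphs it may be preferable to write directly to the file,
+    // e.g. mapper.writerWithDefaultPrettyPrinter().writeValue(configArgs.json, m)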
+
+    val out = new StringWriter
+    mapper.writerWithDefaultPrettyPrinter().writeValue(out, m)
+    val json = out.toString
+    new PrintWriter(configArgs.json) { write(json); close() }
+
+    cpg.close()
+  }
+
+  def main(args: Array[String]): Unit = {
+    val builder = OParser.builder[Args]
+    val parser1 = {
+      import builder._
+      OParser.sequence(
+        programName("joernCpgExport"),
+        opt[Unit]('d', "no-default-overlays")
+          .action((_, c) => c.copy(loadDefaultOverlays = false))
+          .text("do not apply default overlays"),
+        opt[Unit]('o', "no-oss-dataflow")
+          .action((_, c) => c.copy(loadOssDataflowOverlay = false))
+          .text("do not apply oss dataflow overlay"),
+        opt[File]('c', "cpg")
+          .valueName("<file>")
+          .required()
+          .action((x, c) => c.copy(loadCpg = x))
+          .text("load cpg from OverflowDB"),
+        opt[File]('j', "json")
+          .required()
+          .valueName("<file>")
+          .action((x, c) => c.copy(json = x))
+          .text("export cpg as json file"),
+        help("help").text("prints this usage text")
+      )
+    }
+
+    OParser.parse(parser1, args, Args()) match {
+      case Some(config) =>
+        exportCpg(config)
+      case _ =>
+        sys.exit(1)
+    }
+  }
+}
\ No newline at end of file