diff --git a/maven-projects/spark/graphar-cli/README.md b/maven-projects/spark/graphar-cli/README.md new file mode 100644 index 000000000..73dbb1a0d --- /dev/null +++ b/maven-projects/spark/graphar-cli/README.md @@ -0,0 +1,37 @@ +# GraphAr Cli Tool + +This is a project that depends on the GraphAr spark library. + +## Building + +To build this project, we need to use `Maven` to enable the +`datasource` and `graphar-cli` profiles in the `spark` +directory (that is, the parent directory of the current path). + +```bash + $ git clone https://github.com/apache/incubator-graphar.git + $ cd incubator-graphar + $ cd mavens-projects/spark +``` + + +Build the package: + +```bash + $ mvn clean install -DskipTests -P datasources-32,graphar-cli +``` + +## Running + +The build produces a shaded Jar that can be run using the `spark-submit` command: + +```bash + $ cd graphar-cli/target + $ spark-submit --class org.apache.graphar.cli.Main graphar-cli-0.12.0-SNAPSHOT-shaded.jar +``` + +For a shorter command-line invocation, add an alias to your shell like this: + +``` +alias graphar="spark-submit --class org.apache.graphar.cli.Main /path/to/graphar-cli-0.12.0-SNAPSHOT-shaded.jar" +``` diff --git a/maven-projects/spark/graphar-cli/import.json b/maven-projects/spark/graphar-cli/import.json new file mode 100644 index 000000000..3a1b296f2 --- /dev/null +++ b/maven-projects/spark/graphar-cli/import.json @@ -0,0 +1,102 @@ +{ + "graphar": { + "path": "/tmp/graphar/movie", + "name": "MovieGraph", + "vertexChunkSize": 100, + "edgeChunkSize": 1024, + "fileType": "parquet" + }, + "sourceType": "csv", + "neo4j": { + "url": "bolt://localhost:7687", + "username": "neo4j", + "password": "neo4j" + }, + "schema": { + "vertices": [ + { + "label": "Person", + "primary": "name", + "properties": [ + { + "name": "name", + "type": "string" + }, + { + "name": "born", + "type": "int", + "nullable": true + } + ], + "propertyGroups": [ + [ + "name", + "born" + ] + ], + "source": { + "path": "/tmp/graphar/movie/Person.csv", + "delimiter": ",", + "columns": [ + "name", + "born" + ] + } + }, + { + "label": "Movie", + "primary": "title", + "properties": [ + { + "name": "title", + "type": "string" + }, + { + "name": "tagline", + "type": "string", + "nullable": true + } + ], + "propertyGroups": [ + [ + "title", + "tagline" + ] + ], + "source": { + "path": "/tmp/graphar/movie/Movie.csv", + "delimiter": "," + } + } + ], + "edges": [ + { + "label": "PRODUCED", + "adjListType": "", + "srcLabel": "Person", + "srcProp": "name", + "dstLabel": "Movie", + "dstProp": "title", + "source": { + "path": "/tmp/graphar/movie/Produced.csv", + "delimiter": "," + } + }, + { + "label": "REVIEWED", + "srcLabel": "Person", + "srcProp": "name", + "dstLabel": "Movie", + "dstProp": "title", + "properties": [ + "rating", + "summary" + ], + "source": { + "path": "/tmp/graphar/movie/Reviewed.csv", + "delimiter": "," + } + } + ] + } +} \ No newline at end of file diff --git a/maven-projects/spark/graphar-cli/pom.xml b/maven-projects/spark/graphar-cli/pom.xml new file mode 100644 index 000000000..36b18228a --- /dev/null +++ b/maven-projects/spark/graphar-cli/pom.xml @@ -0,0 +1,234 @@ + + + + + 4.0.0 + + + org.apache.graphar + spark + ${graphar.version} + ../pom.xml + + + graphar-cli + ${graphar.version} + jar + + + 1.7.30 + + + + + info.picocli + picocli + 4.7.6 + + + org.slf4j + slf4j-api + ${slf4j.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + ${deps.scope} + + + log4j + log4j + 1.2.17 + ${deps.scope} + + + org.apache.graphar + graphar-commons + ${graphar.version} + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + ${scala.version} + + -target:jvm-1.8 + + + -Xss4096K + + + + + scala-compile + + compile + + + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + scala-test-compile + + testCompile + + + + + + org.scalatest + scalatest-maven-plugin + 2.0.0 + + + test + + test + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.1 + + + package + + shade + + + false + true + + + + *:* + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + **/log4j.properties + + + + + + reference.conf + + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.8.0 + + + + compile + testCompile + + + + + + -Xms64m + -Xmx1024m + + + -Ywarn-unused + + + + org.scalameta + semanticdb-scalac_2.12.10 + 4.3.24 + + + + + + io.github.evis + scalafix-maven-plugin_2.13 + 0.1.8_0.11.0 + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + attach-javadocs + + jar + + + + + + maven-site-plugin + 3.7.1 + + + + \ No newline at end of file diff --git a/maven-projects/spark/graphar-cli/src/main/resources/log4j.properties b/maven-projects/spark/graphar-cli/src/main/resources/log4j.properties new file mode 100644 index 000000000..3eccfe10d --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/resources/log4j.properties @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# debug log4j configuration +#log4j.debug=true + +# by default, log anything but cli console to component logger +log4j.rootLogger = WARN, component + +# Set the appender named console to be a ConsoleAppender +log4j.appender.console=org.apache.log4j.ConsoleAppender + +# CLI console output +log4j.logger.org.apache.graphar.cli=INFO, console +log4j.additivity.org.apache.graphar.cli=false + +# Define the layout for console appender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%m%n + +# Change to turn on component logging +log4j.appender.component=org.apache.log4j.varia.NullAppender + +# Define the layout for component appender +log4j.appender.component.layout=org.apache.log4j.PatternLayout +log4j.appender.component.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p :: %m [%C]%n + diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/Main.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/Main.scala new file mode 100644 index 000000000..5cf6560f5 --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/Main.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli + +import org.apache.commons.logging.LogFactory +import org.apache.graphar.cli.commands.{Check, Import, Show} +import org.apache.log4j.PropertyConfigurator +import org.slf4j.{Logger, LoggerFactory} +import picocli.CommandLine +import picocli.CommandLine.Command + +import java.util.concurrent.Callable + +/** + * Main class for the GraphAr Cli Tool. + */ +@Command( + name = "main", + subcommands = Array(classOf[Show], classOf[Check], classOf[Import]), + mixinStandardHelpOptions = true, + description = Array("GraphAr Cli Tool") +) +class Main extends Callable[Int] { + val console: Logger = LoggerFactory.getLogger(classOf[Main]) + def call: Int = { + console.info( + "GraphAr Cli Tool. Use -h or --help to see available commands." + ) + 1 + } +} + +object Main { + def main(args: Array[String]): Unit = { + PropertyConfigurator.configure( + classOf[Main].getResource("/log4j.properties") + ) + LogFactory.getFactory.setAttribute( + "org.apache.commons.logging.Log", + "org.apache.commons.logging.impl.Log4JLogger" + ) + val exitCode = new CommandLine(new Main()).execute(args: _*) + System.exit(exitCode) + } +} diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Check.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Check.scala new file mode 100644 index 000000000..096f9f148 --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Check.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli.commands + +import org.apache.graphar.GraphInfo +import org.apache.graphar.cli.Main +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger +import picocli.CommandLine.{Command, Option, ParentCommand} + +import java.util.concurrent.Callable + +/** + * Check the graphar meta information. + */ +@Command( + name = "show", + mixinStandardHelpOptions = true, + description = Array("Check the graphar meta information") +) +class Check extends Callable[Int] { + @ParentCommand + private var parent: Main = _ + + @Option( + names = Array("-p", "--path"), + description = Array( + "The path of the YAML file." + ), + required = true + ) + private var path: String = _ + +// TODO(ljj): Support check all the meta information and files. +// @Option( +// names = Array("-a", "--all"), +// description = Array( +// "Check all the meta information and files." +// ) +// ) +// private var checkALL: Boolean = false + + def call(): Int = { + val console: Logger = parent.console + val spark = SparkSession.builder + .appName("GraphArCli") + .master("local[*]") + .getOrCreate() + + try { + val graphInfo = GraphInfo.loadGraphInfo(path, spark) + val vertexInfos = graphInfo.getVertexInfos() + for ((vertexLabel, vertexInfo) <- vertexInfos) { + if (!vertexInfo.isValidated()) { + console.error(s"VertexInfo ${vertexLabel} is not validated.") + return 0 + } + console.info(s"VertexInfo ${vertexLabel} is validated.") + } + val vertexLabels = vertexInfos.keys.toList + val edgeInfos = graphInfo.getEdgeInfos() + for ((concatKeys, edgeInfo) <- edgeInfos) { + if (!edgeInfo.isValidated()) { + console.error(s"EdgeInfo ${concatKeys} is not validated.") + return 0 + } + console.info(s"EdgeInfo ${concatKeys} is validated.") + val srcVertexLabel = edgeInfo.src_label + if (!vertexLabels.contains(srcVertexLabel)) { + console.error( + s"The src_label ${srcVertexLabel} of the edge ${concatKeys} does not exist." + ) + return 0 + } + val dstVertexLabel = edgeInfo.dst_label + if (!vertexLabels.contains(dstVertexLabel)) { + console.error( + s"The dst_label ${dstVertexLabel} of the edge ${concatKeys} does not exist." + ) + return 0 + } + } + console.info("All vertexInfos and edgeInfos are validated.") + spark.close() + 1 + } catch { + case e: Exception => + console.error(s"Check failed: ${e.getMessage}") + 0 + } + + } +} diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Import.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Import.scala new file mode 100644 index 000000000..25a43c779 --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Import.scala @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli.commands + +import org.apache.graphar.cli.Main +import org.apache.graphar.cli.util.Config +import org.apache.graphar.cli.util.Utils.parseStringToSparkType +import org.apache.graphar.graph.GraphWriter +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.{StructField, StructType} +import org.json4s.{DefaultFormats, JValue} +import org.json4s.jackson.JsonMethods.parse +import org.slf4j.Logger +import picocli.CommandLine.{Command, Option, ParentCommand} + +import java.util.concurrent.Callable +import scala.io.Source + +/** + * Read the configuration file and import data. + */ +@Command( + name = "import", + mixinStandardHelpOptions = true, + description = Array("Show the GraphAr information.") +) +class Import extends Callable[Int] { + @ParentCommand + private var parent: Main = _ + @Option( + names = Array("-c", "--config"), + description = Array( + "The path of the import config file." + ), + required = true + ) + private var confPath: String = _ + + def call(): Int = { + val console: Logger = parent.console + val source = Source.fromFile(confPath) + val confString = + try source.mkString + finally source.close() + val confJson: JValue = parse(confString) + +// TODO: validate the config file + implicit val formats: DefaultFormats.type = DefaultFormats + val config = (confJson \ "graphar").extract[Config] + console.info("Config read successfully.") + + config.sourceType match { + case "csv" => + val spark = SparkSession.builder + .appName("GraphAr Cli") + .master("local[*]") + .getOrCreate() + val writer: GraphWriter = new GraphWriter() + val schema = config.schema + + for (vertex <- schema.vertices) { + val source = vertex.source + val dfSchema = StructType( + vertex.properties.map(property => + StructField( + property.name, + parseStringToSparkType(property.`type`), + nullable = property.nullable.getOrElse(true) + ) + ) + ) + val vertex_df = spark.read + .option("header", "true") + .schema(dfSchema) + .option("delimiter", source.delimiter) + .csv(source.path.get) + + writer.PutVertexData( + vertex.label, + vertex_df, + primaryKey = vertex.primary + ) + } + + for (edge <- schema.edges) { + val source = edge.source + val srcStructField = StructField( + edge.srcProp, + parseStringToSparkType( + schema.vertices + .find(_.label == edge.srcLabel) + .get + .properties + .find(_.name == edge.srcProp) + .get + .`type` + ) + ) + val dstStructField = StructField( + edge.dstProp, + parseStringToSparkType( + schema.vertices + .find(_.label == edge.dstLabel) + .get + .properties + .find(_.name == edge.dstProp) + .get + .`type` + ) + ) + val dfSchema = StructType( + srcStructField +: + dstStructField +: + edge.properties.map(property => + StructField( + property.name, + parseStringToSparkType(property.`type`), + nullable = property.nullable.getOrElse(true) + ) + ) + ) + val edge_df = spark.read + .option("header", "true") + .schema(dfSchema) + .option("delimiter", source.delimiter) + .csv(source.path.get) + + writer.PutEdgeData( + (edge.srcLabel, edge.label, edge.dstLabel), + edge_df + ) + } + + } + + val spark = SparkSession.builder + .appName("GraphAr Cli") + .master("local[*]") + .getOrCreate() + spark.close() + 1 + } +} diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Show.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Show.scala new file mode 100644 index 000000000..f4865efd5 --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/commands/Show.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli.commands + +import org.apache.graphar.GraphInfo +import org.apache.graphar.cli.Main +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger +import picocli.CommandLine.{Command, Option, ParentCommand} + +import java.util.concurrent.Callable + +/** + * Show the graphar information. + */ +@Command( + name = "show", + mixinStandardHelpOptions = true, + description = Array("Show the GraphAr information.") +) +class Show extends Callable[Int] { + @ParentCommand + private var parent: Main = _ + + @Option( + names = Array("-p", "--path"), + description = Array( + "The path of the YAML file." + ), + required = true + ) + private var path: String = _ + +// TODO(ljj): Support show more information and more options. + + def call(): Int = { + val console: Logger = parent.console + val spark = SparkSession + .builder() + .enableHiveSupport() + .master("local[*]") + .getOrCreate() + val graphInfo = GraphInfo.loadGraphInfo(path, spark) + console.info(graphInfo.dump()) + spark.close() + 1 + } +} diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Config.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Config.scala new file mode 100644 index 000000000..d6558bc3a --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Config.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli.util + +case class GraphAr( + path: String, + name: String, + vertexChunkSize: Int, + edgeChunkSize: Int, + fileType: String +) + +case class Neo4j(url: String, username: String, password: String) + +case class Property( + name: String, + `type`: String, + nullable: Option[Boolean] = None +) + +case class Source( + `type`: String, + path: Option[String] = None, + url: Option[String] = None, + delimiter: String = "," +) + +case class Vertex( + label: String, + primary: String, + properties: List[Property], + propertyGroups: Option[List[List[String]]] = None, + source: Source +) + +case class Edge( + label: String, + adjListType: String = "ordered_by_source", + srcLabel: String, + srcProp: String, + dstLabel: String, + dstProp: String, + properties: List[Property], + propertyGroups: Option[List[List[String]]] = None, + source: Source +) + +case class Schema( + vertices: List[Vertex], + edges: List[Edge] +) + +case class Config( + graphar: GraphAr, + sourceType: String, + schema: Schema, + neo4j: Option[Neo4j] +) diff --git a/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Utils.scala b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Utils.scala new file mode 100644 index 000000000..2a3175dd2 --- /dev/null +++ b/maven-projects/spark/graphar-cli/src/main/scala/org/apache/graphar/cli/util/Utils.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.graphar.cli.util + +import org.apache.spark.sql.types._ + +object Utils { + def parseStringToSparkType(dataType: String): DataType = { + dataType match { + case "bool" => BooleanType + case "int32" => IntegerType + case "int64" => LongType + case "float" => FloatType + case "double" => DoubleType + case "string" => StringType + case _ => + throw new IllegalArgumentException("Unknown data type: " + dataType) + } + } +} diff --git a/maven-projects/spark/pom.xml b/maven-projects/spark/pom.xml index 455fb1754..54fa50465 100644 --- a/maven-projects/spark/pom.xml +++ b/maven-projects/spark/pom.xml @@ -77,6 +77,12 @@ datasources-33 + + graphar-cli + + graphar-cli + +