Commit

wip
qishipengqsp committed Sep 13, 2024
1 parent 594f9e1 commit 1da63f3
Showing 4 changed files with 37 additions and 21 deletions.
5 changes: 4 additions & 1 deletion scripts/run.py
@@ -64,7 +64,10 @@ def run_local(
**({'spark.shuffle.spill.compress': 'true'}),
**({'spark.serializer': 'org.apache.spark.serializer.KryoSerializer'}),
**({'spark.executor.extraJavaOptions': '-XX:+UseG1GC'}),
-**({'spark.driver.maxResultSize': '5g'}),
+**({'spark.driver.maxResultSize': '0'}),
+**({'spark.memory.offHeap.enabled': 'true'}),
+**({'spark.memory.offHeap.size': '100g'}),
+**({'spark.storage.memoryFraction': '0'}),
# **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug
# **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}),
**spark_conf
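
The added lines move execution memory off-heap and lift the driver's result-size cap (0 disables the limit). Note that spark.storage.memoryFraction is a legacy pre-1.6 knob that modern Spark ignores unless spark.memory.useLegacyMode is enabled. A minimal Scala sketch of the same configuration applied through a SparkSession builder (sizes copied from the diff; the app name is hypothetical):

    import org.apache.spark.sql.SparkSession

    // Illustrative only: the settings this commit adds to run.py,
    // expressed as a SparkSession builder.
    val spark = SparkSession.builder()
      .appName("finbench-datagen-local")             // hypothetical name
      .master("local[*]")
      .config("spark.memory.offHeap.enabled", "true")
      .config("spark.memory.offHeap.size", "100g")   // allocated beyond the JVM heap
      .config("spark.driver.maxResultSize", "0")     // 0 disables the result-size limit
      .config("spark.storage.memoryFraction", "0")   // legacy knob; ignored without legacy mode
      .getOrCreate()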
15 changes: 8 additions & 7 deletions scripts/run_cluster.sh
@@ -13,23 +13,24 @@ echo "start: " `date`
# --conf "spark.dynamicAllocation.enabled=true" \
# --conf "spark.dynamicAllocation.minExecutors=1" \
# --conf "spark.dynamicAllocation.maxExecutors=10" \

# --conf "spark.yarn.maximizeResourceAllocation=true" \
# --conf "spark.memory.offHeap.enabled=true" \
# --conf "spark.memory.offHeap.size=100g" \
time spark-submit --master spark://finbench-large-00:7077 \
--class ldbc.finbench.datagen.LdbcDatagen \
--num-executors 2 \
--conf "spark.default.parallelism=400" \
--conf "spark.default.parallelism=800" \
--conf "spark.network.timeout=100000" \
--conf "spark.shuffle.compress=true" \
--conf "spark.shuffle.spill.compress=true" \
--conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
--conf "spark.driver.memory=200g" \
--conf "spark.driver.maxResultSize=10g" \
--conf "spark.executor.memory=300g" \
--conf "spark.executor.memoryOverheadFactor=0.2" \
--conf "spark.driver.memory=100g" \
--conf "spark.driver.maxResultSize=0" \
--conf "spark.executor.memory=400g" \
--conf "spark.executor.memoryOverheadFactor=0.5" \
--conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \
${LDBC_FINBENCH_DATAGEN_JAR} \
---scale-factor 10 \
+--scale-factor 100 \
--output-dir ${OUTPUT_DIR}

echo "End: " `date`
24 changes: 21 additions & 3 deletions scripts/run_local.sh
@@ -3,6 +3,24 @@
LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar
OUTPUT_DIR=out

-# For more command line arguments, see the main entry for more information at
-# src/main/scala/ldbc/finbench/datagen/LdbcDatagen.scala
-time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 1 --output-dir ${OUTPUT_DIR}
+# run locally with the python script
+# time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 30 --output-dir ${OUTPUT_DIR}
+
+# run locally with spark-submit command
+# **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug
+# **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}),
+time spark-submit --master local[*] \
+  --class ldbc.finbench.datagen.LdbcDatagen \
+  --driver-memory 500g \
+  --conf "spark.default.parallelism=500" \
+  --conf "spark.shuffle.compress=true" \
+  --conf "spark.shuffle.spill.compress=true" \
+  --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+  --conf "spark.memory.offHeap.enabled=true" \
+  --conf "spark.memory.offHeap.size=100g" \
+  --conf "spark.storage.memoryFraction=0" \
+  --conf "spark.driver.maxResultSize=0" \
+  --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \
+  ${LDBC_FINBENCH_DATAGEN_JAR} \
+  --scale-factor 30 \
+  --output-dir ${OUTPUT_DIR}
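
spark.default.parallelism=500 sets the partition count Spark falls back to when none is given explicitly, which is what the datagen's RDD-producing steps inherit. A small illustration of that generic Spark behavior (not FinBench-specific code; reuses the spark session from the earlier sketch):

    // spark.default.parallelism is the fallback partition count for RDDs
    // created without an explicit numSlices argument.
    val rdd = spark.sparkContext.parallelize(1 to 1000000)
    println(rdd.getNumPartitions)  // 500 under the setting above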
14 changes: 4 additions & 10 deletions src/main/scala/ldbc/finbench/datagen/generation/GenerationStage.scala
@@ -48,16 +48,10 @@ object GenerationStage extends DatagenStage with Logging {
)
}

-// TODO: It's better to define multiple job groups.
-SparkUI.job(
-  implicitly[ClassTag[ActivitySimulator]].runtimeClass.getSimpleName,
-  "serialize Finbench data"
-) {
-  val simulator = new ActivitySimulator(
-    RawSink(args.outputDir, format, args.partitionsOpt)
-  )
-  simulator.simulate(config)
-}
+val simulator = new ActivitySimulator(
+  RawSink(args.outputDir, format, args.partitionsOpt)
+)
+simulator.simulate(config)
}

private def buildConfig(args: Args): DatagenConfiguration = {
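
Judging from its call shape, the removed SparkUI.job wrapper labeled the enclosed work with a job group and description so it shows up identifiably in the Spark web UI; the commit now runs the simulator directly, trading that labeling away. A hypothetical reconstruction of such a helper, assuming that labeling is all it did:

    import org.apache.spark.sql.SparkSession

    // Hypothetical reconstruction of the removed helper: tag the Spark jobs
    // run by `action` with a group name and description for the web UI, then
    // clear the tag afterwards.
    def job[T](group: String, description: String)(action: => T)(implicit spark: SparkSession): T = {
      spark.sparkContext.setJobGroup(group, description)
      try action
      finally spark.sparkContext.clearJobGroup()
    }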
