Handle NessieConflictException in Delta clients + handling of multiple-branches
#1249
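At a high level, the change is about retrying a Delta commit against Nessie when another writer moved the branch in between, i.e. when the client gets a `NessieConflictException`. Below is a minimal sketch of that general retry pattern, not the code added by this PR: `CommitAttempt`, `commitWithRetry` and `maxRetries` are made-up names, and the exception is assumed to live in `org.projectnessie.error` as in the Nessie client. The new `testCommitRetry` test further down exercises this situation by merging a side branch into `main` and then writing to `main` again.

```java
import org.projectnessie.error.NessieConflictException;

// Hypothetical sketch of the retry-on-conflict pattern; not the PR's implementation.
final class CommitRetrySketch {

  @FunctionalInterface
  interface CommitAttempt {
    // One commit attempt against the latest known branch state; throws
    // NessieConflictException if the branch moved concurrently.
    void run() throws NessieConflictException;
  }

  static void commitWithRetry(CommitAttempt attempt, int maxRetries) throws NessieConflictException {
    for (int i = 0; ; i++) {
      try {
        attempt.run();   // try to commit against the current branch hash
        return;          // committed without a conflict
      } catch (NessieConflictException e) {
        if (i >= maxRetries) {
          throw e;       // still conflicting after the allowed number of attempts
        }
        // Another writer committed first: the attempt is expected to re-read the
        // branch hash and re-apply its log entry before the next iteration.
      }
    }
  }
}
```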
**ITDeltaLog.java**

```java
@@ -15,6 +15,8 @@
 */
package org.projectnessie.deltalake;

import static org.projectnessie.client.NessieConfigConstants.CONF_NESSIE_REF;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;

@@ -32,8 +34,12 @@
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.io.TempDir;
import org.projectnessie.client.tests.AbstractSparkTest;
import org.projectnessie.model.Branch;
import org.projectnessie.model.ImmutableMerge;
import org.projectnessie.model.Reference;

import io.delta.tables.DeltaTable;
import scala.Tuple2;

@@ -51,6 +57,91 @@ protected static void createDelta() {
        .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog");
  }

  @Test
  // Delta < 0.8 w/ Spark 2.x doesn't support multiple branches well (warnings when changing the configuration)
```
**Contributor:** for my benefit can you explain a bit more? If I recall correctly …

**Member (author):** that's delta/spark2 - it complains with a warning that the configuration has changed and "might not have been applied completely".
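As background on the `@DisabledIfSystemProperty` guard used below: JUnit 5 skips the annotated test whenever the named system property matches the given regex, so these multi-branch tests can be switched off by starting the JVM with `-Dskip-multi-branch-tests=true`. A tiny standalone sketch of that mechanism (class and method names are made up for illustration):

```java
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;

// Illustration only: shows how the system-property gate behaves.
class MultiBranchGateSketch {

  @Test
  @DisabledIfSystemProperty(named = "skip-multi-branch-tests", matches = "true")
  void runsUnlessMultiBranchTestsAreSkipped() {
    // Runs normally; JUnit 5 skips it when the JVM is started with
    // -Dskip-multi-branch-tests=true, e.g. for Delta/Spark combinations that
    // warn when the Nessie branch configuration is changed at runtime.
  }
}
```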
```java
  @DisabledIfSystemProperty(named = "skip-multi-branch-tests", matches = "true")
  void testMultipleBranches() throws Exception {
    String csvSalaries1 = ITDeltaLog.class.getResource("/salaries1.csv").getPath();
    String csvSalaries2 = ITDeltaLog.class.getResource("/salaries2.csv").getPath();
    String csvSalaries3 = ITDeltaLog.class.getResource("/salaries3.csv").getPath();
    String pathSalaries = new File(tempPath, "salaries").getAbsolutePath();

    spark.sql(String.format("CREATE TABLE IF NOT EXISTS test_multiple_branches (Season STRING, Team STRING, Salary STRING, "
        + "Player STRING) USING delta LOCATION '%s'", pathSalaries));
    Dataset<Row> salariesDf1 = spark.read().option("header", true).csv(csvSalaries1);
    salariesDf1.write().format("delta").mode("overwrite").save(pathSalaries);

    Dataset<Row> count1 = spark.sql("SELECT COUNT(*) FROM test_multiple_branches");
    Assertions.assertEquals(15L, count1.collectAsList().get(0).getLong(0));

    Reference mainBranch = nessieClient.getTreeApi().getReferenceByName("main");

    Reference devBranch = nessieClient.getTreeApi().createReference(Branch.of("testMultipleBranches", mainBranch.getHash()));

    spark.sparkContext().getConf().set("spark.hadoop." + CONF_NESSIE_REF, devBranch.getName());
    spark.sparkContext().hadoopConfiguration().set(CONF_NESSIE_REF, devBranch.getName());

    Dataset<Row> salariesDf2 = spark.read().option("header", true).csv(csvSalaries2);
    salariesDf2.write().format("delta").mode("append").save(pathSalaries);

    Dataset<Row> count2 = spark.sql("SELECT COUNT(*) FROM test_multiple_branches");
    Assertions.assertEquals(30L, count2.collectAsList().get(0).getLong(0));

    spark.sparkContext().getConf().set("spark.hadoop.nessie.ref", "main");
    spark.sparkContext().hadoopConfiguration().set("nessie.ref", "main");

    Dataset<Row> salariesDf3 = spark.read().option("header", true).csv(csvSalaries3);
    salariesDf3.write().format("delta").mode("append").save(pathSalaries);

    Dataset<Row> count3 = spark.sql("SELECT COUNT(*) FROM test_multiple_branches");
    Assertions.assertEquals(35L, count3.collectAsList().get(0).getLong(0));
  }

  @Test
  // Delta < 0.8 w/ Spark 2.x doesn't support multiple branches well (warnings when changing the configuration)
  @DisabledIfSystemProperty(named = "skip-multi-branch-tests", matches = "true")
  void testCommitRetry() throws Exception {
    String csvSalaries1 = ITDeltaLog.class.getResource("/salaries1.csv").getPath();
```
**Contributor:** Any reason to use csvs when everywhere else in spark tests we create simple few row in memory datasets?

**Member (author):** A mixture of "laziness and reproduction of the original issue". I wanted this test to be close to what the demo in …
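For comparison with the reviewer's suggestion, the same kind of input could be built as a small in-memory Dataset instead of reading a CSV resource. This is a hedged sketch, not code from the PR; the class name is invented and the schema/rows merely mirror the salary files:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Illustration of the reviewer's alternative: a few rows built directly in memory.
class InMemorySalariesSketch {

  static Dataset<Row> salaries(SparkSession spark) {
    StructType schema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("Season", DataTypes.StringType, false),
        DataTypes.createStructField("Team", DataTypes.StringType, false),
        DataTypes.createStructField("Salary", DataTypes.StringType, false),
        DataTypes.createStructField("Player", DataTypes.StringType, false)));

    List<Row> rows = Arrays.asList(
        RowFactory.create("2003-04", "Cleveland Cavaliers", "$4018920", "Lebron James"),
        RowFactory.create("2004-05", "Cleveland Cavaliers", "$4320360", "Lebron James"));

    // Serves the same purpose as spark.read().csv(...) in the test, without resource files.
    return spark.createDataFrame(rows, schema);
  }
}
```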
```java
    String csvSalaries2 = ITDeltaLog.class.getResource("/salaries2.csv").getPath();
    String csvSalaries3 = ITDeltaLog.class.getResource("/salaries3.csv").getPath();
    String pathSalaries = new File(tempPath, "salaries").getAbsolutePath();

    spark.sql(String.format("CREATE TABLE IF NOT EXISTS test_commit_retry (Season STRING, Team STRING, Salary STRING, "
        + "Player STRING) USING delta LOCATION '%s'", pathSalaries));
    Dataset<Row> salariesDf1 = spark.read().option("header", true).csv(csvSalaries1);
    salariesDf1.write().format("delta").mode("overwrite").save(pathSalaries);

    Dataset<Row> count1 = spark.sql("SELECT COUNT(*) FROM test_commit_retry");
    Assertions.assertEquals(15L, count1.collectAsList().get(0).getLong(0));

    Reference mainBranch = nessieClient.getTreeApi().getReferenceByName("main");

    Reference devBranch = nessieClient.getTreeApi().createReference(Branch.of("testCommitRetry", mainBranch.getHash()));

    spark.sparkContext().getConf().set("spark.hadoop." + CONF_NESSIE_REF, devBranch.getName());
    spark.sparkContext().hadoopConfiguration().set(CONF_NESSIE_REF, devBranch.getName());

    Dataset<Row> salariesDf2 = spark.read().option("header", true).csv(csvSalaries2);
    salariesDf2.write().format("delta").mode("append").save(pathSalaries);

    Dataset<Row> count2 = spark.sql("SELECT COUNT(*) FROM test_commit_retry");
    Assertions.assertEquals(30L, count2.collectAsList().get(0).getLong(0));

    String toHash = nessieClient.getTreeApi().getReferenceByName("main").getHash();
    String fromHash = nessieClient.getTreeApi().getReferenceByName("testCommitRetry").getHash();

    nessieClient.getTreeApi().mergeRefIntoBranch("main", toHash, ImmutableMerge.builder().fromHash(fromHash).build());

    spark.sparkContext().getConf().set("spark.hadoop.nessie.ref", "main");
    spark.sparkContext().hadoopConfiguration().set("nessie.ref", "main");

    Dataset<Row> salariesDf3 = spark.read().option("header", true).csv(csvSalaries3);
    salariesDf3.write().format("delta").mode("append").save(pathSalaries);

    Dataset<Row> count3 = spark.sql("SELECT COUNT(*) FROM test_commit_retry");
    Assertions.assertEquals(50L, count3.collectAsList().get(0).getLong(0));
  }

  @Test
  void testWithoutCondition() {
    Dataset<Row> targetTable = createKVDataSet(Arrays.asList(tuple2(1, 10), tuple2(2, 20), tuple2(3, 30), tuple2(4, 40)), "key", "value");
```
**salaries1.csv** (new file)

```csv
@@ -0,0 +1,16 @@
Season,Team,Salary,Player
2003-04,Cleveland Cavaliers,$4018920,Lebron James
2004-05,Cleveland Cavaliers,$4320360,Lebron James
2005-06,Cleveland Cavaliers,$4621800,Lebron James
2006-07,Cleveland Cavaliers,$5828090,Lebron James
2007-08,Cleveland Cavaliers,$13041250,Lebron James
2008-09,Cleveland Cavaliers,$14410581,Lebron James
2009-10,Cleveland Cavaliers,$15779912,Lebron James
2010-11,Miami Heat,$14500000,Lebron James
2011-12,Miami Heat,$16022500,Lebron James
2012-13,Miami Heat,$17545000,Lebron James
2013-14,Miami Heat,$19067500,Lebron James
2014-15,Cleveland Cavaliers,$20644400,Lebron James
2015-16,Cleveland Cavaliers,$22971000,Lebron James
2016-17,Cleveland Cavaliers,$30963450,Lebron James
2017-18,Cleveland Cavaliers,$33285709,Lebron James
```
**salaries2.csv** (new file)

```csv
@@ -0,0 +1,16 @@
Season,Team,Salary,Player
1984-85,Chicago Bulls,$550000,Michael Jordan
1985-86,Chicago Bulls,$630000,Michael Jordan
1987-88,Chicago Bulls,$845000,Michael Jordan
1988-89,Chicago Bulls,$2000000,Michael Jordan
1989-90,Chicago Bulls,$2500000,Michael Jordan
1990-91,Chicago Bulls,$2500000,Michael Jordan
1991-92,Chicago Bulls,$3250000,Michael Jordan
1992-93,Chicago Bulls,$4000000,Michael Jordan
1993-94,Chicago Bulls,$4000000,Michael Jordan
1994-95,Chicago Bulls,$3850000,Michael Jordan
1995-96,Chicago Bulls,$3850000,Michael Jordan
1996-97,Chicago Bulls,$30140000,Michael Jordan
1997-98,Chicago Bulls,$33140000,Michael Jordan
2001-02,Washington Wizards,$1000000,Michael Jordan
2002-03,Washington Wizards,$1030000,Michael Jordan
```
**salaries3.csv** (new file)

```csv
@@ -0,0 +1,21 @@
Season,Team,Salary,Player
1996-97,Los Angeles Lakers,$1015000,Kobe Bryant
1997-98,Los Angeles Lakers,$1167240,Kobe Bryant
1998-99,Los Angeles Lakers,$1319000,Kobe Bryant
1999-00,Los Angeles Lakers,$9000000,Kobe Bryant
2000-01,Los Angeles Lakers,$10130000,Kobe Bryant
2001-02,Los Angeles Lakers,$11250000,Kobe Bryant
2002-03,Los Angeles Lakers,$12375000,Kobe Bryant
2003-04,Los Angeles Lakers,$13500000,Kobe Bryant
2004-05,Los Angeles Lakers,$14175000,Kobe Bryant
2005-06,Los Angeles Lakers,$15946875,Kobe Bryant
2006-07,Los Angeles Lakers,$17718750,Kobe Bryant
2007-08,Los Angeles Lakers,$19490625,Kobe Bryant
2008-09,Los Angeles Lakers,$21262500,Kobe Bryant
2009-10,Los Angeles Lakers,$23034375,Kobe Bryant
2010-11,Los Angeles Lakers,$24806250,Kobe Bryant
2011-12,Los Angeles Lakers,$25244493,Kobe Bryant
2012-13,Los Angeles Lakers,$27849149,Kobe Bryant
2013-14,Los Angeles Lakers,$30453805,Kobe Bryant
2014-15,Los Angeles Lakers,$23500000,Kobe Bryant
2015-16,Los Angeles Lakers,$25000000,Kobe Bryant
```
This file was deleted.

**Reviewer:** why a `java` map and not a `scala` map?

**Author:** Just need a mutable map here - and using a Java `HashMap` felt to be the easiest way to go.
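To illustrate the point being made: a `java.util.HashMap` can be updated in place, whereas the default Scala `Map` is immutable and would have to be rebuilt on every change. A trivial sketch of that difference (class and keys are illustrative only, unrelated to the deleted file's contents):

```java
import java.util.HashMap;
import java.util.Map;

// Illustration of the "just need a mutable map" argument.
class MutableMapSketch {
  static void example() {
    // java.util.HashMap can be mutated in place...
    Map<String, String> options = new HashMap<>();
    options.put("nessie.ref", "main");
    options.put("nessie.ref", "testMultipleBranches"); // overwrites the previous value

    // ...whereas an immutable scala.collection.immutable.Map would require
    // building a new map (e.g. via `+`) for every such update.
  }
}
```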