Merge branch 'release-1.3.2'

olehmberg · Jun 11, 2019 · 35e88f6 · 35e88f6
2 parents 7f85c65 + 7b3e59a
commit 35e88f6
Show file tree

Hide file tree

Showing 105 changed files with 1,125 additions and 1,405 deletions.
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ You can include the WInte.r framework via the following Maven dependency:
 <dependency>
 	<groupId>de.uni_mannheim.informatik.dws</groupId>
 	<artifactId>winter</artifactId>
-	<version>1.3.1</version>
+	<version>1.3.2</version>
 </dependency>
 ```
 

diff --git a/winter-extensions/winter-metanome/metanome_integration/pom.xml b/winter-extensions/winter-metanome/metanome_integration/pom.xml
@@ -4,7 +4,7 @@
 
   <groupId>de.uni_mannheim.informatik.dws.winter</groupId>
   <artifactId>metanome_integration</artifactId>
-  <version>1.0</version>
+  <version>1.1</version>
   <packaging>jar</packaging>
 
   <name>metanome_integration</name>
@@ -43,13 +43,13 @@
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>3.8.1</version>
+      <version>4.12</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>de.uni_mannheim.informatik.dws</groupId>
       <artifactId>winter</artifactId>
-      <version>1.3</version>
+      <version>1.3.2</version>
     </dependency>
 		<dependency>
 			<!-- use this option when building: -Xss10M -->
@@ -62,11 +62,6 @@
 			<artifactId>algorithm_integration</artifactId>
 			<version>1.1-SNAPSHOT</version>
 		</dependency>
-    <dependency>
-      <groupId>de.metanome.algorithms.tane</groupId>
-      <artifactId>TANE-approximate</artifactId>
-      <version>1.0</version>
-    </dependency>
     <dependency>
 			<groupId>de.metanome.algorithms.hyfd</groupId>
 			<artifactId>HyFD</artifactId>

diff --git a/...n/java/de/uni_mannheim/informatik/dws/winter/webtables/FunctionalDependencyDiscovery.java b/...n/java/de/uni_mannheim/informatik/dws/winter/webtables/FunctionalDependencyDiscovery.java
@@ -11,8 +11,11 @@
  */
 package de.uni_mannheim.informatik.dws.winter.webtables;
 
-
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
 import java.io.PrintStream;
 import java.util.Collection;
 import java.util.HashMap;
@@ -23,15 +26,18 @@
 import java.util.Map;
 import java.util.Set;
 
+// import de.hpi.isg.pyro.algorithms.Pyro;
 import de.metanome.algorithm_integration.AlgorithmExecutionException;
 import de.metanome.algorithm_integration.ColumnIdentifier;
 import de.metanome.algorithm_integration.input.RelationalInputGenerator;
 import de.metanome.algorithm_integration.result_receiver.ColumnNameMismatchException;
 import de.metanome.algorithm_integration.result_receiver.CouldNotReceiveResultException;
 import de.metanome.algorithm_integration.result_receiver.FunctionalDependencyResultReceiver;
+// import de.metanome.algorithm_integration.result_receiver.UniqueColumnCombinationResultReceiver;
 import de.metanome.algorithm_integration.results.FunctionalDependency;
+// import de.metanome.algorithm_integration.results.UniqueColumnCombination;
 import de.metanome.algorithms.hyfd.HyFD;
-import de.metanome.algorithms.tane.TaneAlgorithm;
+// import de.metanome.algorithms.tane.TaneAlgorithm;
 import de.uni_mannheim.informatik.dws.winter.model.Pair;
 import de.uni_mannheim.informatik.dws.winter.utils.StringUtils;
 import de.uni_mannheim.informatik.dws.winter.utils.query.Q;
@@ -217,102 +223,149 @@ public static void calculcateFunctionalDependencies(Collection<Table> tables, Fi
 
 	}
 
-public static void calculateApproximateFunctionalDependencies(Collection<Table> tables, File csvLocation, double errorThreshold) throws Exception {
-		PrintStream tmp = new PrintStream(new File("TANE.out"));
-		final PrintStream out = System.out;
-
-		try {
-			// calculate functional dependencies
-			CSVTableWriter csvWriter = new CSVTableWriter();
-			for(Table t : tables) {
-				out.println(String.format("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}", 
-						t.getTableId(),
-						t.getPath(),
-						StringUtils.join(Q.project(t.getColumns(), new TableColumn.ColumnHeaderProjection()), ",")));
-
-				File tableAsCsv = csvWriter.write(t, new File(csvLocation, t.getPath()));
-
-				System.setOut(tmp);
-
-				Map<Set<TableColumn>, Set<TableColumn>> fds = calculateApproximateFunctionalDependencies(t, tableAsCsv, errorThreshold);
-				t.getSchema().setFunctionalDependencies(fds);
-				Set<Set<TableColumn>> candidateKeys = listCandidateKeys(t);
-
-
-
-				if(candidateKeys.size()==0) {
-					candidateKeys.add(new HashSet<>(t.getColumns()));
+	public static File taneRoot = null;
+
+	public static void calculateApproximateFunctionalDependencies(Collection<Table> tables, File csvLocation, double errorThreshold) throws Exception {
+		CSVTableWriter csvWriter = new CSVTableWriter();
+		File taneDataLocation = new File(taneRoot, "original");
+		File taneDescriptionLocation = new File(taneRoot, "descriptions");
+		// File taneExec = new File(taneLocation, "bin/taneg3");
+		// File tanePrepare = new File(taneLocation, "bin/select.perl");
+		for(Table t : tables) {
+			System.out.println(String.format("[calculateApproximateFunctionalDependencies] calculating functional dependencies for table #%d %s {%s}", 
+					t.getTableId(),
+					t.getPath(),
+					StringUtils.join(Q.project(t.getColumns(), new TableColumn.ColumnHeaderProjection()), ",")));
+
+			// write file
+			// File tableAsCsv = csvWriter.write(t, new File(taneDataLocation, t.getPath()));
+			File tableAsCsv = new File(taneDataLocation, t.getPath());
+			BufferedWriter w = new BufferedWriter(new FileWriter(tableAsCsv));
+			for(TableRow r : t.getRows()) {
+				Object[] values = r.getValueArray();
+				for(int i = 0; i < values.length; i++) {
+					Object o = values[i];
+					if(i>0) {
+						w.write(",");
+					}
+					if(o!=null) {
+						w.write(o.toString().replace(",", ""));
+					}
 				}
-				t.getSchema().setCandidateKeys(candidateKeys);
+				w.write("\n");
 			}
-		} catch(AlgorithmExecutionException e) {
-			throw new Exception(e.getMessage());
-		} finally {
-			System.setOut(out);
-		}
-
-	}
+			w.close();
+
+			// write description
+			String descriptionFileName = t.getPath() + ".dsc";
+			File description = new File(taneDescriptionLocation, descriptionFileName);
+			w = new BufferedWriter(new FileWriter(description));
+			w.write("Umask = 007\n");
+			w.write(String.format("DataIn = ../original/%s\n", tableAsCsv.getName()));
+			w.write("RemoveDuplicates = OFF\nAttributesOut = $BASENAME.atr\nStandardOut = ../data/$BASENAME.dat\nSavnikFlachOut = ../data/$BASENAME.rel\nNOOFDUPLICATES=1\n");
+			w.close();
+
+			// prepare dataset
+			String cmd = "../bin/select.perl ../descriptions/" + descriptionFileName;
+			System.out.println(String.format("%s$ %s", taneDataLocation.getAbsolutePath(), cmd));
+			Process p = Runtime.getRuntime().exec(cmd, null, taneDataLocation);
+			String line = null;
+			BufferedReader r = null;
+			r = new BufferedReader(new InputStreamReader(p.getInputStream()));
+			while((line = r.readLine()) != null) {
+				System.out.println(line);
+			}
+			r.close();
+			r = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+			while((line = r.readLine()) != null) {
+				System.out.println(line);
+			}
+			r.close();
+
+			// run tane
+			String nameWithoutExtension = t.getPath().replaceAll("\\..{3,4}$", "");
+			File dataLocation = new File(taneRoot, "data/" + nameWithoutExtension + ".dat");
+			cmd = String.format("./bin/taneg3 11 %d %d %s %f", t.getRows().size(), t.getColumns().size(), dataLocation.getAbsolutePath(), errorThreshold);
+			System.out.println(String.format("%s$ %s", taneRoot.getAbsolutePath(), cmd));
+			p = Runtime.getRuntime().exec(cmd, null, taneRoot);
+
+			Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
+			Set<Set<TableColumn>> keys = new HashSet<>();
+			r = new BufferedReader(new InputStreamReader(p.getInputStream()));
+			while((line = r.readLine()) != null) {
+				System.out.println(line);
+				// FDs lines always start with a number or ->
+				String[] values = line.split("\\s");
+				boolean isFdLine = false;
+
+				if(line.startsWith("->")) {
+					isFdLine = true;
+				} else {
+					try {
+						Integer.parseInt(values[0]);
+						isFdLine = true;
+					} catch(NumberFormatException ex) { isFdLine = false; }
+				}
 
-	public static Map<Set<TableColumn>, Set<TableColumn>> calculateApproximateFunctionalDependencies(final Table t, File tableAsCsv, double errorThreshold) throws Exception {
-		TaneAlgorithm tane = new TaneAlgorithm();
-		tane.setErrorThreshold(errorThreshold);
-
-		final Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
-
-		try {
-			RelationalInputGenerator input = new WebTableFileInputGenerator(tableAsCsv);
-			tane.setRelationalInputConfigurationValue(TaneAlgorithm.INPUT_TAG, input);
-			tane.setResultReceiver(new FunctionalDependencyResultReceiver() {
-
-				@Override
-				public void receiveResult(FunctionalDependency arg0)
-						throws CouldNotReceiveResultException, ColumnNameMismatchException {
-
-					synchronized (this) {
-
-
+				if(isFdLine) {
 					Set<TableColumn> det = new HashSet<>();
-
-					// identify determinant
-					for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {						    		
-						Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
-
-						det.add(t.getSchema().get(colIdx));
+					TableColumn dep = null;
+
+					boolean depStart = false;
+					for(int i = 0; i < values.length; i++) {
+						if(depStart) {
+							int idx = Integer.parseInt(values[i]) - 1;
+							dep = t.getSchema().get(idx);
+							break;
+						} else {
+							if("->".equals(values[i])) {
+								depStart = true;
+							} else {
+								int idx = Integer.parseInt(values[i]) - 1;
+								det.add(t.getSchema().get(idx));
+							}
+						}
 					}
 
-					// add dependant
-					Set<TableColumn> dep = null;
+					Set<TableColumn> mergedDep = null;
 					// check if we already have a dependency with the same determinant
 					if(functionalDependencies.containsKey(det)) {
 						// if so, we add the dependent to the existing dependency
-						dep = functionalDependencies.get(det);
+						mergedDep = functionalDependencies.get(det);
 					} 
-					if(dep==null) {
+					if(mergedDep==null) {
 						// otherwise, we create a new dependency
-						dep = new HashSet<>();
-						functionalDependencies.put(det, dep);
+						mergedDep = new HashSet<>();
+						functionalDependencies.put(det, mergedDep);
 					}
-					Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
-					dep.add(t.getSchema().get(colIdx));
+					mergedDep.add(dep);
 
+					System.out.println(String.format("{%s}->{%s}",
+						StringUtils.join(Q.project(det, (c)->c.getHeader()), ","),
+						StringUtils.join(Q.project(mergedDep, (c)->c.getHeader()), ",")
+					));
+
+					if(line.contains("key")) {
+						keys.add(det);
 					}
 				}
-
-				@Override
-				public Boolean acceptedResult(FunctionalDependency arg0) {
-					return true;
-				}
-			});
+			}
+			r.close();
+			r = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+			while((line = r.readLine()) != null) {
+				System.out.println(line);
+			}
+			r.close();
 
-			tane.execute();
-		} catch(AlgorithmExecutionException e) {
-			throw new Exception(e.getMessage());
+			t.getSchema().setFunctionalDependencies(functionalDependencies);
+			t.getSchema().setCandidateKeys(keys);
 		}
-
-		return functionalDependencies;
+	}
+	public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv) throws Exception {
+		return calculateFunctionalDependencies(t, tableAsCsv, null);
 	}
 
-public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv) throws Exception {
+	public static Map<Set<TableColumn>, Set<TableColumn>> calculateFunctionalDependencies(final Table t, File tableAsCsv, final Set<Pair<Set<TableColumn>, Set<TableColumn>>> fds) throws Exception {
 		HyFD dep = new HyFD();
 		dep.setBooleanConfigurationValue(HyFD.Identifier.VALIDATE_PARALLEL.name(), true);
 		final Map<Set<TableColumn>, Set<TableColumn>> functionalDependencies = new HashMap<>();
@@ -327,32 +380,32 @@ public void receiveResult(FunctionalDependency arg0)
 						throws CouldNotReceiveResultException, ColumnNameMismatchException {
 
 					synchronized (this) {
+						Set<TableColumn> det = new HashSet<>();
 
+						// identify determinant
+						for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {						    		
+							Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
 
-					Set<TableColumn> det = new HashSet<>();
-
-					// identify determinant
-					for(ColumnIdentifier ci : arg0.getDeterminant().getColumnIdentifiers()) {						    		
-						Integer colIdx = Integer.parseInt(ci.getColumnIdentifier());
-
-						det.add(t.getSchema().get(colIdx));
-					}
+							det.add(t.getSchema().get(colIdx));
+						}
 
-					// add dependant
-					Set<TableColumn> dep = null;
-					// check if we already have a dependency with the same determinant
-					if(functionalDependencies.containsKey(det)) {
-						// if so, we add the dependent to the existing dependency
-						dep = functionalDependencies.get(det);
-					} 
-					if(dep==null) {
-						// otherwise, we create a new dependency
-						dep = new HashSet<>();
-						functionalDependencies.put(det, dep);
-					}
-					Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
-					dep.add(t.getSchema().get(colIdx));
-
+						// add dependant
+						Set<TableColumn> dep = null;
+						// check if we already have a dependency with the same determinant
+						if(functionalDependencies.containsKey(det)) {
+							// if so, we add the dependent to the existing dependency
+							dep = functionalDependencies.get(det);
+						} 
+						if(dep==null) {
+							// otherwise, we create a new dependency
+							dep = new HashSet<>();
+							functionalDependencies.put(det, dep);
+						}
+						Integer colIdx = Integer.parseInt(arg0.getDependant().getColumnIdentifier());
+						dep.add(t.getSchema().get(colIdx));
+						if(fds!=null) {
+							fds.add(new Pair<>(det, dep));
+						}
 					}
 				}