Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

blocking tree changes #924

Merged
merged 17 commits into from
Jan 3, 2025
Merged
32 changes: 26 additions & 6 deletions assembly/dependency-reduced-pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,32 @@
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<version>5.2.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>5.2.0</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>byte-buddy</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>byte-buddy-agent</artifactId>
<groupId>net.bytebuddy</groupId>
</exclusion>
<exclusion>
<artifactId>objenesis</artifactId>
<groupId>org.objenesis</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
Expand Down Expand Up @@ -113,12 +139,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>1.8.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-all</artifactId>
Expand Down
54 changes: 11 additions & 43 deletions common/core/src/main/java/zingg/common/core/block/Block.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public abstract class Block<D,R,C,T> implements Serializable {
private static final long serialVersionUID = 1L;

public static final Log LOG = LogFactory.getLog(Block.class);
private final IHashFunctionUtility<D, R, C, T> hashFunctionUtility;

protected ZFrame<D,R,C> dupes;
// Class[] types;
Expand All @@ -30,10 +31,11 @@ public abstract class Block<D,R,C,T> implements Serializable {
protected ListMap<HashFunction<D,R,C,T>, String> childless;

public Block() {

this.hashFunctionUtility = HashFunctionUtilityFactory.getHashFunctionUtility(HashUtility.CACHED);
}

public Block(ZFrame<D,R,C> training, ZFrame<D,R,C> dupes) {
this();
this.training = training;
this.dupes = dupes;
childless = new ListMap<HashFunction<D,R,C,T>, String>();
Expand Down Expand Up @@ -145,7 +147,7 @@ public void estimateElimCount(Canopy<R> c, long elimCount) {
for (HashFunction function : functions) {
// /if (!used.contains(field.getIndex(), function) &&
if (least ==0) break;//how much better can it get?
if (!isFunctionUsed(tree, node, field.fieldName, function) //&&
if (!hashFunctionUtility.isHashFunctionUsed(field, function, tree, node) //&&
//!childless.contains(function, field.fieldName)
)
{
Expand Down Expand Up @@ -231,6 +233,8 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
LOG.debug("Size is bigger ");
Canopy<R>best = getBestNode(tree, parent, node, fieldsOfInterest);
if (best != null) {
//add function, context info for this best node in set
hashFunctionUtility.addHashFunctionIfRequired(best);
if (LOG.isDebugEnabled()) {
LOG.debug(" HashFunction is " + best + " and node is " + node);
}
Expand Down Expand Up @@ -258,6 +262,8 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,

getBlockingTree(tree, node, n, fieldsOfInterest);
}
//remove function, context info for this best node as we are returning from best node
hashFunctionUtility.removeHashFunctionIfRequired(best);
}
else {
node.clearBeforeSaving();
Expand All @@ -279,48 +285,10 @@ public Tree<Canopy<R>> getBlockingTree(Tree<Canopy<R>> tree, Canopy<R>parent,
return tree;
}

public boolean checkFunctionInNode(Canopy<R>node, String name,
HashFunction function) {
if (node.getFunction() != null && node.getFunction().equals(function)
&& node.context.fieldName.equals(name)) {
return true;
}
return false;
}
// public boolean isFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> function) {
// return hashFunctionsInCurrentNodePath.contains(getKey(fieldDefinition, function));
// }

public boolean isFunctionUsed(Tree<Canopy<R>> tree, Canopy<R>node, String fieldName,
HashFunction function) {
// //LOG.debug("Tree " + tree);
// //LOG.debug("Node " + node);
// //LOG.debug("Index " + index);
// //LOG.debug("Function " + function);
boolean isUsed = false;
if (node == null || tree == null)
return false;
if (checkFunctionInNode(node, fieldName, function))
return true;
Tree<Canopy<R>> nodeTree = tree.getTree(node);
if (nodeTree == null)
return false;

Tree<Canopy<R>> parent = nodeTree.getParent();
if (parent != null) {
Canopy<R>head = parent.getHead();
while (head != null) {
// check siblings of node
/*for (Tree<Canopy<R>> siblings : parent.getSubTrees()) {
Canopy<R>sibling = siblings.getHead();
if (checkFunctionInNode(sibling, index, function))
return true;
}*/
// check parent of node
return isFunctionUsed(tree, head, fieldName, function);
}
}
return isUsed;
}


public List<Canopy<R>> getHashSuccessors(Collection<Canopy<R>> successors, Object hash) {
List<Canopy<R>> retCanopy = new ArrayList<Canopy<R>>();
for (Canopy<R>c: successors) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

import java.util.HashSet;
import java.util.Set;

public class CacheBasedHashFunctionUtility<D, R, C, T> implements IHashFunctionUtility<D, R, C, T> {

private final Set<String> hashFunctionsInCurrentNodePath;
private static final String DELIMITER = ":";

public CacheBasedHashFunctionUtility() {
this.hashFunctionsInCurrentNodePath = new HashSet<String>();
}

@Override
public boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R> node) {
return hashFunctionsInCurrentNodePath.contains(getKey(fieldDefinition, hashFunction));
}

@Override
public void addHashFunctionIfRequired(Canopy<R> node) {
addHashFunctionInCurrentNodePath(node);
}

@Override
public void removeHashFunctionIfRequired(Canopy<R> node) {
removeHashFunctionInCurrentNodePath(node);
}

private void addHashFunctionInCurrentNodePath(Canopy<R> node) {
this.hashFunctionsInCurrentNodePath.add(getKey(node.getContext(), node.getFunction()));
}

private void removeHashFunctionInCurrentNodePath(Canopy<R> node) {
this.hashFunctionsInCurrentNodePath.remove(getKey(node.getContext(), node.getFunction()));
}

private String getKey(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction) {
return fieldDefinition.getName() + DELIMITER + hashFunction.getName();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

public class DefaultHashFunctionUtility<D, R, C, T> implements IHashFunctionUtility<D, R, C, T>{
@Override
public boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R> node) {
boolean isUsed = false;
if (node == null || tree == null) {
return false;
Fixed Show fixed Hide fixed
}
if (checkFunctionInNode(node, fieldDefinition.fieldName, hashFunction)) {
return true;
Fixed Show fixed Hide fixed
}
Tree<Canopy<R>> nodeTree = tree.getTree(node);
if (nodeTree == null) {
return false;
Fixed Show fixed Hide fixed
}

Tree<Canopy<R>> parent = nodeTree.getParent();
if (parent != null) {
Canopy<R>head = parent.getHead();
while (head != null) {
// check siblings of node
/*for (Tree<Canopy<R>> siblings : parent.getSubTrees()) {
Canopy<R>sibling = siblings.getHead();
if (checkFunctionInNode(sibling, index, function))
return true;
}*/
// check parent of node
return isHashFunctionUsed(fieldDefinition, hashFunction, tree, head);

Check warning

Code scanning / PMD

Avoid using a branching statement as the last in a loop. Warning

Avoid using a branching statement as the last in a loop.
}
}
return isUsed;
}

@Override
public void addHashFunctionIfRequired(Canopy<R> node) {
//don't add hashFunction to cache
//as we are in default mode
}

@Override
public void removeHashFunctionIfRequired(Canopy<R> node) {
//don't remove hashFunction from cache
//as we are in default mode
}

private boolean checkFunctionInNode(Canopy<R>node, String name,
HashFunction<D, R, C, T> function) {
return node.getFunction() != null && node.getFunction().equals(function)
&& node.context.fieldName.equals(name);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package zingg.common.core.block;

public class HashFunctionUtilityFactory {

Check warning

Code scanning / PMD

This utility class has a non-private constructor Warning

This utility class has a non-private constructor

public static IHashFunctionUtility getHashFunctionUtility(HashUtility hashUtility) {

if (HashUtility.DEFAULT.equals(hashUtility)) {
return new DefaultHashFunctionUtility();
}
return new CacheBasedHashFunctionUtility();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package zingg.common.core.block;

public enum HashUtility {
DEFAULT,
CACHED
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package zingg.common.core.block;

import zingg.common.client.FieldDefinition;
import zingg.common.core.hash.HashFunction;

public interface IHashFunctionUtility<D, R, C, T> {
boolean isHashFunctionUsed(FieldDefinition fieldDefinition, HashFunction<D, R, C, T> hashFunction, Tree<Canopy<R>> tree, Canopy<R>node);

void addHashFunctionIfRequired(Canopy<R> node);

void removeHashFunctionIfRequired(Canopy<R> node);
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package zingg.common.core.executor;

import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

Expand Down Expand Up @@ -87,7 +85,7 @@ public void execute() throws ZinggClientException {

ZFrame<D,R,C> sample = getStopWords().preprocessForStopWords(sampleOrginal);

Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList());
Tree<Canopy<R>> tree = getBlockingTreeUtil().createBlockingTree(sample, posPairs, 1, -1, args, getHashUtil().getHashFunctionList());
//tree.print(2);
ZFrame<D,R,C> blocked = getBlockingTreeUtil().getBlockHashes(sample, tree);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
public abstract class BlockingTreeUtil<S, D,R,C,T> {

public final Log LOG = LogFactory.getLog(BlockingTreeUtil.class);


private PipeUtilBase<S, D, R, C> pipeUtil;

Expand All @@ -32,8 +31,6 @@ public PipeUtilBase<S, D, R, C> getPipeUtil() {
}




public void setPipeUtil(PipeUtilBase<S, D, R, C> pipeUtil) {
this.pipeUtil = pipeUtil;
}
Expand All @@ -43,10 +40,10 @@ public abstract Block<D,R,C,T> getBlock(ZFrame<D,R,C> sample, ZFrame<D,R,C> posi
ListMap<T, HashFunction<D,R,C,T>>hashFunctions, long blockSize);


public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
ZFrame<D,R,C> positives, double sampleFraction, long blockSize,
IArguments args,
ListMap<T, HashFunction<D,R,C,T>> hashFunctions) throws Exception, ZinggClientException {
public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
ZFrame<D,R,C> positives, double sampleFraction, long blockSize,
IArguments args,
ListMap<T, HashFunction<D,R,C,T>> hashFunctions) throws Exception, ZinggClientException {
ZFrame<D,R,C> sample = testData.sample(false, sampleFraction);
sample = sample.cache();
long totalCount = sample.count();
Expand All @@ -68,9 +65,7 @@ public Tree<Canopy<R>> createBlockingTree(ZFrame<D,R,C> testData,
fd.add(def);
}
}

Tree<Canopy<R>> blockingTree = cblock.getBlockingTree(null, null, root,
fd);
Tree<Canopy<R>> blockingTree = cblock.getBlockingTree(null, null, root, fd);
if (LOG.isDebugEnabled()) {
LOG.debug("The blocking tree is ");
blockingTree.print(2);
Expand Down
Loading
Loading