-
Notifications
You must be signed in to change notification settings - Fork 123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Running using Databricks Connect #582 #583
Changes from 10 commits
e54046d
64f7a06
681465e
e0371aa
3eab8d7
99e9ca1
fe01f2c
6764867
4051246
7097759
f23c1b2
645b03d
48b2134
36878b7
552d091
ea7e8f4
47b493a
18f9c77
40b817a
26f1135
2df6704
c5abe56
53ae447
880b4f6
a3ddd46
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package zingg.common.client; | ||
|
||
import java.util.List; | ||
|
||
import zingg.common.client.pipe.Pipe; | ||
|
||
public interface ITrainingHelper<S, D, R, C> { | ||
|
||
public void setMarkedRecordsStat(ZFrame<D, R, C> markedRecords); | ||
|
||
public Long getMarkedRecordsStat(ZFrame<D, R, C> markedRecords, long value); | ||
|
||
public Long getMatchedMarkedRecordsStat(ZFrame<D, R, C> markedRecords); | ||
|
||
public Long getUnmatchedMarkedRecordsStat(ZFrame<D, R, C> markedRecords); | ||
|
||
public Long getUnsureMarkedRecordsStat(ZFrame<D, R, C> markedRecords); | ||
|
||
public ZFrame<D, R, C> getClusterIdsFrame(ZFrame<D, R, C> lines); | ||
|
||
public List<R> getClusterIds(ZFrame<D, R, C> lines); | ||
|
||
public List<C> getDisplayColumns(ZFrame<D, R, C> lines, Arguments args); | ||
|
||
public ZFrame<D, R, C> getCurrentPair(ZFrame<D, R, C> lines, int index, List<R> clusterIds, ZFrame<D, R, C> clusterLines); | ||
|
||
public double getScore(ZFrame<D, R, C> currentPair); | ||
|
||
public double getPrediction(ZFrame<D, R, C> currentPair); | ||
|
||
public String getMsg1(int index, int totalPairs); | ||
|
||
public String getMsg2(double prediction, double score); | ||
|
||
public void displayRecords(ZFrame<D, R, C> records, String preMessage, String postMessage); | ||
|
||
public ZFrame<D, R, C> updateRecords(int matchValue, ZFrame<D, R, C> newRecords, ZFrame<D, R, C> updatedRecords); | ||
|
||
public void updateLabellerStat(int selected_option, int increment); | ||
|
||
public void printMarkedRecordsStat(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Won't this go in the view? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Kept the update in the model and the print in the view (commit 48b2134); please review. |
||
|
||
public void writeLabelledOutput(ZFrame<D, R, C> records, Arguments args) throws ZinggClientException; | ||
|
||
public void writeLabelledOutput(ZFrame<D,R,C> records, Arguments args, Pipe p) throws ZinggClientException; | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,11 +14,13 @@ | |
import zingg.common.core.util.LabelMatchType; | ||
|
||
public abstract class LabelUpdater<S,D,R,C,T> extends Labeller<S,D,R,C,T> { | ||
private static final long serialVersionUID = 1L; | ||
protected static String name = "zingg.LabelUpdater"; | ||
public static final Log LOG = LogFactory.getLog(LabelUpdater.class); | ||
|
||
public LabelUpdater() { | ||
setZinggOptions(ZinggOptions.UPDATE_LABEL); | ||
setTrainingHelper(new TrainingHelper<S,D,R,C>()); | ||
} | ||
|
||
public void execute() throws ZinggClientException { | ||
|
@@ -33,12 +35,12 @@ public void execute() throws ZinggClientException { | |
} | ||
} | ||
|
||
public void processRecordsCli(ZFrame<D,R,C> lines) throws ZinggClientException { | ||
public ZFrame<D,R,C> processRecordsCli(ZFrame<D,R,C> lines) throws ZinggClientException { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need to return a ZFrame here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is done so that writing of the labelled output happens in a separate method. This is needed for the Python API to work. |
||
LOG.info("Processing Records for CLI updateLabelling"); | ||
|
||
if (lines != null && lines.count() > 0) { | ||
getMarkedRecordsStat(lines); | ||
printMarkedRecordsStat(); | ||
getTrainingHelper().setMarkedRecordsStat(lines); | ||
getTrainingHelper().printMarkedRecordsStat(); | ||
|
||
List<C> displayCols = getDSUtil().getFieldDefColumns(lines, args, false, args.getShowConcise()); | ||
try { | ||
|
@@ -67,9 +69,9 @@ public void processRecordsCli(ZFrame<D,R,C> lines) throws ZinggClientException { | |
String matchType = LabelMatchType.get(matchFlag).msg; | ||
postMsg = String.format("\tThe above pair is labeled as %s\n", matchType); | ||
selectedOption = displayRecordsAndGetUserInput(getDSUtil().select(currentPair, displayCols), preMsg, postMsg); | ||
updateLabellerStat(selectedOption, +1); | ||
updateLabellerStat(matchFlag, -1); | ||
printMarkedRecordsStat(); | ||
getTrainingHelper().updateLabellerStat(selectedOption, +1); | ||
getTrainingHelper().updateLabellerStat(matchFlag, -1); | ||
getTrainingHelper().printMarkedRecordsStat(); | ||
if (selectedOption == 9) { | ||
LOG.info("User has quit in the middle. Updating the records."); | ||
break; | ||
|
@@ -80,15 +82,16 @@ public void processRecordsCli(ZFrame<D,R,C> lines) throws ZinggClientException { | |
updatedRecords = updatedRecords | ||
.filter(updatedRecords.notEqual(ColName.CLUSTER_COLUMN,cluster_id)); | ||
} | ||
updatedRecords = updateRecords(selectedOption, currentPair, updatedRecords); | ||
updatedRecords = getTrainingHelper().updateRecords(selectedOption, currentPair, updatedRecords); | ||
} while (selectedOption != 9); | ||
|
||
if (updatedRecords != null) { | ||
updatedRecords = updatedRecords.union(recordsToUpdate); | ||
} | ||
writeLabelledOutput(updatedRecords); | ||
getTrainingHelper().writeLabelledOutput(updatedRecords,args,getOutputPipe()); | ||
sc.close(); | ||
LOG.info("Processing finished."); | ||
return updatedRecords; | ||
} catch (Exception e) { | ||
if (LOG.isDebugEnabled()) { | ||
e.printStackTrace(); | ||
|
@@ -98,6 +101,7 @@ public void processRecordsCli(ZFrame<D,R,C> lines) throws ZinggClientException { | |
} | ||
} else { | ||
LOG.info("There is no marked record for updating. Please run findTrainingData/label jobs to generate training data."); | ||
return null; | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if:
We have two interfaces here: TrainingDataModel and LabelDataViewHelper. The data model has methods for reading and writing training pairs, getting scores, etc. The view has the messages.
TrainingDataModel should extend ZinggBase, so that it automatically gets pipeutil and the other context stuff. ZinggBase already has the methods to get stats etc., and other methods can be moved there. You can use TDM (TrainingDataModel) in Labeller and LabelUpdater, just like we use the trainer and matcher in trainmatcher.
TDM and LabelDataViewHelper are returned from Client methods and used in Python.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A first draft is available in commit 48b2134; please review.