Skip to content

Commit

Permalink
Merge pull request #574 from zinggAI/issue415
Browse files Browse the repository at this point in the history
Issue 415: source column in the input leads to an error
  • Loading branch information
sonalgoyal authored May 1, 2023
2 parents eb7ecb2 + 3bbceb5 commit d932174
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public interface ColName {
public static final String SCORE_MIN_COL = COL_PREFIX + "minScore";
public static final String SCORE_MAX_COL = COL_PREFIX + "maxScore";
public static final String SPARK_JOB_ID_COL = COL_PREFIX + "sparkJobId";
public static final String SOURCE_COL = COL_PREFIX + "source";
public static final String SOURCE_COL = COL_PREFIX + "zsource";
public static final String SCORE_KEY_COL = COL_PREFIX + "scorekey";
public static final String DENSE_COL = COL_PREFIX + "dense";
public static final String UPDATED_AT = COL_PREFIX + "updated";
Expand Down
4 changes: 2 additions & 2 deletions common/core/src/main/resources/zColumnTemplate.ftlh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
<p>"z_score - the probability of a pair of records matching. The higher the score, the more likely they are a match."</p>
<#elseif title == "z_isMatch">
<p>z_isMatch - this is the label provided by the user.</p>
<#elseif title == "z_source">
<p>z_source - the source of data as set in the name property of the data in the Zingg configuration file.</p>
<#elseif title == "z_zsource">
<p>z_zsource - the source of data as set in the name property of the data in the Zingg configuration file.</p>
<#else>
<p>${title} - this field is internally used by Zingg.</p>
</#if>
Expand Down
2 changes: 1 addition & 1 deletion models/100/docs/model.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
<th class="border-right border-white" > <a href="state.html"> state </a></th>
<th class="border-right border-white" > <a href="dob.html"> dob </a></th>
<th class="border-right border-white" > <a href="ssn.html"> ssn </a></th>
<th class="border-right border-white" > <a href="z_source.html"> z_source </a></th>
<th class="border-right border-white" > <a href="z_zsource.html"> z_zsource </a></th>
<th class="border-right border-white" > <a href="z_isMatch.html"> z_isMatch </a></th>
</tr>
</thead>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<html>
<head>
<title>z_source</title>
<title>z_zsource</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
</head>

Expand All @@ -9,7 +9,7 @@
<a class="navbar-brand" href="https://www.zingg.ai">
<img src="https://github.com/zinggai/zingg/raw/main/assets/zinggWhiteTransparent.png" class="d-inline-block align-top" alt="">
</a>
<h1> Field - z_source </h1>
<h1> Field - z_zsource </h1>
<a href="../model.html">
<div class="justify-content-end">Model 100</div>
</a>
Expand All @@ -22,7 +22,7 @@ <h1> Field - z_source </h1>
</tbody>
</table>
<p>
<p>z_source - the source of data as set in the name property of the data in the Zingg configuration file.</p>
<p>z_zsource - the source of data as set in the name property of the data in the Zingg configuration file.</p>
</p>
</body>
</html>
Expand Down
18 changes: 9 additions & 9 deletions perf/joinPlan.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
== Physical Plan ==
*(11) Project [z_z_zid#597L, z_zid#367L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82, z_id#1103, z_fname#1104, z_lname#1105, z_stNo#1106, z_add1#1107, z_add2#1108, z_city#1109, z_state#1110, z_dob#1111, z_ssn#1112, z_z_source#1113]
*(11) Project [z_z_zid#597L, z_zid#367L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82, z_id#1103, z_fname#1104, z_lname#1105, z_stNo#1106, z_add1#1107, z_add2#1108, z_city#1109, z_state#1110, z_dob#1111, z_ssn#1112, z_z_zsource#1113]
+- *(11) SortMergeJoin [z_z_zid#597L], [z_z_zid#1102L], Inner
:- *(9) Sort [z_z_zid#597L ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(z_z_zid#597L, 3000), false, [id=#472]
: +- *(8) Project [z_zid#367L, z_z_zid#597L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82]
: +- *(8) Project [z_zid#367L, z_z_zid#597L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82]
: +- *(8) SortMergeJoin [z_zid#367L], [z_zid#71L], Inner
: :- *(6) Sort [z_zid#367L ASC NULLS FIRST], false, 0
: : +- Exchange hashpartitioning(z_zid#367L, 3000), false, [id=#456]
Expand All @@ -13,16 +13,16 @@ i== Physical Plan ==
: : : +- Exchange hashpartitioning(z_hash#379, 3000), false, [id=#372]
: : : +- *(1) SerializeFromObject [validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 0, z_zid), LongType) AS z_zid#367L, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 12, z_hash), IntegerType) AS z_hash#379]
: : : +- *(1) MapElements zingg.block.Block$BlockFunction@37e7e089, obj#366: org.apache.spark.sql.Row
: : : +- *(1) DeserializeToObject createexternalrow(z_zid#71L, id#72.toString, fname#73.toString, lname#74.toString, stNo#75.toString, add1#76.toString, add2#77.toString, city#78.toString, state#79.toString, dob#80.toString, ssn#81.toString, z_source#82.toString, StructField(z_zid,LongType,false), StructField(id,StringType,true), StructField(fname,StringType,true), StructField(lname,StringType,true), StructField(stNo,StringType,true), StructField(add1,StringType,true), StructField(add2,StringType,true), StructField(city,StringType,true), StructField(state,StringType,true), StructField(dob,StringType,true), StructField(ssn,StringType,true), StructField(z_source,StringType,false)), obj#365: org.apache.spark.sql.Row
: : : +- *(1) DeserializeToObject createexternalrow(z_zid#71L, id#72.toString, fname#73.toString, lname#74.toString, stNo#75.toString, add1#76.toString, add2#77.toString, city#78.toString, state#79.toString, dob#80.toString, ssn#81.toString, z_zsource#82.toString, StructField(z_zid,LongType,false), StructField(id,StringType,true), StructField(fname,StringType,true), StructField(lname,StringType,true), StructField(stNo,StringType,true), StructField(add1,StringType,true), StructField(add2,StringType,true), StructField(city,StringType,true), StructField(state,StringType,true), StructField(dob,StringType,true), StructField(ssn,StringType,true), StructField(z_zsource,StringType,false)), obj#365: org.apache.spark.sql.Row
: : : +- Exchange hashpartitioning(z_zid#71L, 3000), false, [id=#324]
: : : +- InMemoryTableScan [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82]
: : : +- InMemoryRelation [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82], StorageLevel(memory, deserialized, 1 replicas)
: : : +- InMemoryTableScan [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82]
: : : +- InMemoryRelation [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82], StorageLevel(memory, deserialized, 1 replicas)
: : : +- Exchange RoundRobinPartitioning(3000), false, [id=#40]
: : : +- *(1) Scan ExistingRDD[z_zid#71L,id#72,fname#73,lname#74,stNo#75,add1#76,add2#77,city#78,state#79,dob#80,ssn#81,z_source#82]
: : : +- *(1) Scan ExistingRDD[z_zid#71L,id#72,fname#73,lname#74,stNo#75,add1#76,add2#77,city#78,state#79,dob#80,ssn#81,z_zsource#82]
: : +- *(4) Sort [z_hash#592 ASC NULLS FIRST], false, 0
: : +- ReusedExchange [z_zid#580L, z_hash#592], Exchange hashpartitioning(z_hash#379, 3000), false, [id=#372]
: +- *(7) Sort [z_zid#71L ASC NULLS FIRST], false, 0
: +- ReusedExchange [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82], Exchange hashpartitioning(z_zid#71L, 3000), false, [id=#324]
: +- ReusedExchange [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82], Exchange hashpartitioning(z_zid#71L, 3000), false, [id=#324]
+- *(10) Sort [z_z_zid#1102L ASC NULLS FIRST], false, 0
+- *(10) Project [z_zid#71L AS z_z_zid#1102L, id#72 AS z_id#1103, fname#73 AS z_fname#1104, lname#74 AS z_lname#1105, stNo#75 AS z_stNo#1106, add1#76 AS z_add1#1107, add2#77 AS z_add2#1108, city#78 AS z_city#1109, state#79 AS z_state#1110, dob#80 AS z_dob#1111, ssn#81 AS z_ssn#1112, z_source#82 AS z_z_source#1113]
+- ReusedExchange [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_source#82], Exchange hashpartitioning(z_zid#71L, 3000), false, [id=#324]
+- *(10) Project [z_zid#71L AS z_z_zid#1102L, id#72 AS z_id#1103, fname#73 AS z_fname#1104, lname#74 AS z_lname#1105, stNo#75 AS z_stNo#1106, add1#76 AS z_add1#1107, add2#77 AS z_add2#1108, city#78 AS z_city#1109, state#79 AS z_state#1110, dob#80 AS z_dob#1111, ssn#81 AS z_ssn#1112, z_zsource#82 AS z_z_zsource#1113]
+- ReusedExchange [z_zid#71L, id#72, fname#73, lname#74, stNo#75, add1#76, add2#77, city#78, state#79, dob#80, ssn#81, z_zsource#82], Exchange hashpartitioning(z_zid#71L, 3000), false, [id=#324]
2 changes: 1 addition & 1 deletion python/phases/exportModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def main():

def export_data(labelledData, location):

baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_source', 'z_isMatch']
baseCols = ['z_cluster', 'z_zid', 'z_prediction', 'z_score', 'z_zsource', 'z_isMatch']
sourceDataColumns = [c for c in labelledData.columns if c not in baseCols]
additionalTrainingColumns = ['z_cluster','z_isMatch']
trainingSampleColumns = [*additionalTrainingColumns, *sourceDataColumns]
Expand Down

0 comments on commit d932174

Please sign in to comment.