From cb0bfad2bf6fad2dd205c963f48163996ed220a3 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Thu, 22 Aug 2024 01:35:18 +0530 Subject: [PATCH 1/9] Use automatic column selector in left-join Find out common columns between the 2 datasets and use the common columns in them as column-selector This change introduces a new arity in left-join function --- src/tablecloth/api/join_concat_ds.clj | 4 ++++ test/tablecloth/api/join_concat_ds_test.clj | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/tablecloth/api/join_concat_ds.clj b/src/tablecloth/api/join_concat_ds.clj index ce20745..6be46c2 100644 --- a/src/tablecloth/api/join_concat_ds.clj +++ b/src/tablecloth/api/join_concat_ds.clj @@ -54,6 +54,10 @@ (multi-join impl ds-left ds-right cols-left cols-right options)))) (defn left-join + ([ds-left ds-right] + (let [cols-l (set (column-names ds-left :all)) + cols-r (set (column-names ds-right :all))] + (left-join ds-left ds-right (vec (s/intersection cols-l cols-r))))) ([ds-left ds-right columns-selector] (left-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join j/left-join ds-left ds-right columns-selector options))) diff --git a/test/tablecloth/api/join_concat_ds_test.clj b/test/tablecloth/api/join_concat_ds_test.clj index 0ac5928..e26d3e2 100644 --- a/test/tablecloth/api/join_concat_ds_test.clj +++ b/test/tablecloth/api/join_concat_ds_test.clj @@ -84,6 +84,17 @@ [:i :y]) (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]) +(fact "eraderna int-string join with automatic column selector" + (-> (api/left-join (-> (api/dataset [{:i "foo" :y 2022}])) + (-> (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}]))) + (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}] + (-> (api/left-join (-> (api/dataset [{:i "foo" :y 2022}]) + (api/convert-types {:y :int16})) + (-> (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i 
"foo" :y 2023 :s "2023"}]))) + (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]) + (fact "left join on shorts packed into the vector" (-> (api/left-join (-> (api/dataset [{:iy ["foo" (short 2022)]}])) (-> (api/dataset [{:iy ["foo" (long 2022)] :s "2022"} From a1ca0ee1b7a94c65551dd0e4b2f02892a6511dcf Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Sat, 24 Aug 2024 01:54:27 +0530 Subject: [PATCH 2/9] Add automatic selector functionality for all join functions --- src/tablecloth/api/join_concat_ds.clj | 21 ++++++++++++++++++--- test/tablecloth/api/join_concat_ds_test.clj | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/tablecloth/api/join_concat_ds.clj b/src/tablecloth/api/join_concat_ds.clj index 6be46c2..d6c5162 100644 --- a/src/tablecloth/api/join_concat_ds.clj +++ b/src/tablecloth/api/join_concat_ds.clj @@ -53,26 +53,35 @@ (impl [(first cols-left) (first cols-right)] ds-left ds-right (or options {})) (multi-join impl ds-left ds-right cols-left cols-right options)))) -(defn left-join - ([ds-left ds-right] +(defn- automatic-columns-selector [ds-left ds-right] (let [cols-l (set (column-names ds-left :all)) cols-r (set (column-names ds-right :all))] - (left-join ds-left ds-right (vec (s/intersection cols-l cols-r))))) + (vec (s/intersection cols-l cols-r))) ) + +(defn left-join + ([ds-left ds-right] + (left-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (left-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join j/left-join ds-left ds-right columns-selector options))) (defn right-join + ([ds-left ds-right] + (right-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (right-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join j/right-join ds-left ds-right columns-selector options))) (defn 
inner-join + ([ds-left ds-right] + (inner-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (inner-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join j/inner-join ds-left ds-right columns-selector options))) (defn asof-join + ([ds-left ds-right] + (asof-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (asof-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join j/left-join-asof ds-left ds-right columns-selector options))) @@ -86,6 +95,8 @@ (defn full-join "Join keeping all rows" + ([ds-left ds-right] + (full-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (full-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (apply-join full-join-wrapper ds-left ds-right columns-selector options))) @@ -99,12 +110,16 @@ (distinct))) (defn semi-join + ([ds-left ds-right] + (semi-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (semi-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (->> (semi-anti-join-indexes ds-left ds-right columns-selector options) (select-rows ds-left)))) (defn anti-join + ([ds-left ds-right] + (anti-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (anti-join ds-left ds-right columns-selector nil)) ([ds-left ds-right columns-selector options] (->> (semi-anti-join-indexes ds-left ds-right columns-selector options) diff --git a/test/tablecloth/api/join_concat_ds_test.clj b/test/tablecloth/api/join_concat_ds_test.clj index e26d3e2..7311e2f 100644 --- a/test/tablecloth/api/join_concat_ds_test.clj +++ b/test/tablecloth/api/join_concat_ds_test.clj @@ -84,7 +84,7 @@ [:i :y]) (api/rows :as-maps)) => [{:i 
"foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]) -(fact "eraderna int-string join with automatic column selector" +(fact "int-string join with automatic column selector" (-> (api/left-join (-> (api/dataset [{:i "foo" :y 2022}])) (-> (api/dataset [{:i "foo" :y 2022 :s "2022"} {:i "foo" :y 2023 :s "2023"}]))) From 59604c4256fec811b5c98a913ffd00669bd8cfde Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Sun, 1 Sep 2024 02:17:30 +0530 Subject: [PATCH 3/9] Update api ns with new function signature --- src/tablecloth/api.clj | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/tablecloth/api.clj b/src/tablecloth/api.clj index da8e6a0..ffa6474 100644 --- a/src/tablecloth/api.clj +++ b/src/tablecloth/api.clj @@ -263,6 +263,8 @@ (defn anti-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/anti-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/anti-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] @@ -315,6 +317,8 @@ (defn asof-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/asof-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/asof-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] @@ -1180,6 +1184,8 @@ column-names function returns names according to columns-selector (defn full-join "Join keeping all rows" + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/full-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/full-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] @@ -1337,6 +1343,8 @@ column-names function returns names according to columns-selector (defn inner-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/inner-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/inner-join ds-left ds-right columns-selector)) ([ds-left ds-right 
columns-selector options] @@ -1393,6 +1401,8 @@ column-names function returns names according to columns-selector (defn left-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/left-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/left-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] @@ -2078,6 +2088,8 @@ column-names function returns names according to columns-selector (defn right-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/right-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/right-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] @@ -2187,6 +2199,8 @@ column-names function returns names according to columns-selector (defn semi-join + ([ds-left ds-right] + (tablecloth.api.join-concat-ds/semi-join ds-left ds-right)) ([ds-left ds-right columns-selector] (tablecloth.api.join-concat-ds/semi-join ds-left ds-right columns-selector)) ([ds-left ds-right columns-selector options] From 81f627b1d5ac0cddc5cb98194f115e371c8a1ab1 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Mon, 2 Sep 2024 12:41:31 +0530 Subject: [PATCH 4/9] Add another test --- test/tablecloth/api/join_concat_ds_test.clj | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/tablecloth/api/join_concat_ds_test.clj b/test/tablecloth/api/join_concat_ds_test.clj index 7311e2f..3155690 100644 --- a/test/tablecloth/api/join_concat_ds_test.clj +++ b/test/tablecloth/api/join_concat_ds_test.clj @@ -95,6 +95,11 @@ {:i "foo" :y 2023 :s "2023"}]))) (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]) +(fact "int-string join with automatic column selector - when there are no common columns" + (-> (api/left-join (-> (api/dataset [{:i "foo" :x 2022}])) + (-> (api/dataset [{:y 2022 :z "bar"}]))) + (api/rows :as-maps)) => [{:i "foo", :x 2022, :y 2022 :z "bar"}]) + (fact "left join on shorts packed 
into the vector" (-> (api/left-join (-> (api/dataset [{:iy ["foo" (short 2022)]}])) (-> (api/dataset [{:iy ["foo" (long 2022)] :s "2022"} From bb1c4637f9a223f83fa377b33739b1ebb4238931 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Tue, 10 Sep 2024 00:42:22 +0530 Subject: [PATCH 5/9] remove :all from column-selector --- src/tablecloth/api/join_concat_ds.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tablecloth/api/join_concat_ds.clj b/src/tablecloth/api/join_concat_ds.clj index d6c5162..3a58b5f 100644 --- a/src/tablecloth/api/join_concat_ds.clj +++ b/src/tablecloth/api/join_concat_ds.clj @@ -54,8 +54,8 @@ (multi-join impl ds-left ds-right cols-left cols-right options)))) (defn- automatic-columns-selector [ds-left ds-right] - (let [cols-l (set (column-names ds-left :all)) - cols-r (set (column-names ds-right :all))] + (let [cols-l (set (column-names ds-left)) + cols-r (set (column-names ds-right))] (vec (s/intersection cols-l cols-r))) ) (defn left-join From d42c6003e06763d214231e74cf3c527b33ca547d Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Mon, 16 Sep 2024 18:50:53 +0530 Subject: [PATCH 6/9] Fix typo --- src/tablecloth/api.clj | 2 +- src/tablecloth/api/join_separate.clj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tablecloth/api.clj b/src/tablecloth/api.clj index ffa6474..2d0b3f6 100644 --- a/src/tablecloth/api.clj +++ b/src/tablecloth/api.clj @@ -1359,7 +1359,7 @@ column-names function returns names according to columns-selector (defn join-columns - "Join clumns of dataset. Accepts: + "Join columns of dataset. Accepts: dataset column selector (as in select-columns) options diff --git a/src/tablecloth/api/join_separate.clj b/src/tablecloth/api/join_separate.clj index 861aae4..e85d8f7 100644 --- a/src/tablecloth/api/join_separate.clj +++ b/src/tablecloth/api/join_separate.clj @@ -18,7 +18,7 @@ (if drop-columns? 
(drop-columns result col-names) result))) (defn join-columns - "Join clumns of dataset. Accepts: + "Join columns of dataset. Accepts: dataset column selector (as in select-columns) options From 6702403565d2d7fc8d926f0e7c6e258c31046e87 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Mon, 16 Sep 2024 18:53:18 +0530 Subject: [PATCH 7/9] Add tests for join functions --- test/tablecloth/api/join_concat_ds_test.clj | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/tablecloth/api/join_concat_ds_test.clj b/test/tablecloth/api/join_concat_ds_test.clj index 3155690..cdf66ea 100644 --- a/test/tablecloth/api/join_concat_ds_test.clj +++ b/test/tablecloth/api/join_concat_ds_test.clj @@ -127,3 +127,39 @@ (api/dataset [{:k "baz"}]) [:k]) (api/rows :as-maps)) => [{:k "baz", :v "\"baz\""} {:k "baz", :v "\"baz\""}]) + +(fact "right join with automatic column selector" + (-> (api/right-join (api/dataset [{:i "foo" :y 2022}]) + (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}])) + (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"} + {:i nil, :y nil, :right.i "foo", :right.y 2023, :s "2023"}]) + +(fact "inner join with automatic column selector" + (-> (api/inner-join (api/dataset [{:i "foo" :y 2022}]) + (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}])) + (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]) + +(fact "full join with automatic column selector" + (-> (api/full-join (api/dataset [{:i "foo" :y 2022} + {:i "bar" :y 2021 }]) + (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}])) + (api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"} + {:i "bar", :y 2021, :right.i nil, :right.y nil, :s nil} + {:i nil, :y nil, :right.i "foo", :right.y 2023, :s "2023"}]) + +(fact "anti join with automatic column selector" + (-> (api/anti-join (api/dataset [{:i "foo" :y 2022} + {:i "bar" 
:y 2021 }]) + (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}])) + (api/rows :as-maps)) => [{:i "bar", :y 2021}]) + +(fact "semi join with automatic column selector" + (-> (api/semi-join (api/dataset [{:i "foo" :y 2022} + {:i "bar" :y 2021 }]) + (api/dataset [{:i "foo" :y 2022 :s "2022"} + {:i "foo" :y 2023 :s "2023"}])) + (api/rows :as-maps)) => [{:i "foo", :y 2022}]) From eeb0691cf0f216895fa803ac4ddada2c139aef02 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Tue, 17 Sep 2024 02:10:44 +0530 Subject: [PATCH 8/9] Add docstring --- src/tablecloth/api/join_concat_ds.clj | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tablecloth/api/join_concat_ds.clj b/src/tablecloth/api/join_concat_ds.clj index 3a58b5f..8debeae 100644 --- a/src/tablecloth/api/join_concat_ds.clj +++ b/src/tablecloth/api/join_concat_ds.clj @@ -59,6 +59,10 @@ (vec (s/intersection cols-l cols-r))) ) (defn left-join + "Applies the left-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - + - hashing - Hashing function to use (default identity) + - drop-join-column?
- Remove joined columns (default true)" ([ds-left ds-right] (left-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (left-join ds-left ds-right columns-selector nil)) From 47d8d9d2cc73f76d57ff99d7e467e8b484027fb1 Mon Sep 17 00:00:00 2001 From: Prakash Balodi Date: Thu, 19 Sep 2024 01:51:21 +0530 Subject: [PATCH 9/9] Add more doc-strings --- src/tablecloth/api/join_concat_ds.clj | 29 ++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/tablecloth/api/join_concat_ds.clj b/src/tablecloth/api/join_concat_ds.clj index 8debeae..e8ce3be 100644 --- a/src/tablecloth/api/join_concat_ds.clj +++ b/src/tablecloth/api/join_concat_ds.clj @@ -61,8 +61,8 @@ (defn left-join "Applies the left-join operation on the datasets. If no columns-selector is provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - - - hashing - Hashing function to use (default identity) - - drop-join-column? - Remove joined columns (default true)" + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (left-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (left-join ds-left ds-right columns-selector nil)) @@ -70,6 +70,10 @@ (apply-join j/left-join ds-left ds-right columns-selector options))) (defn right-join + "Applies the right-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors.
Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (right-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (right-join ds-left ds-right columns-selector nil)) @@ -77,6 +81,10 @@ (apply-join j/right-join ds-left ds-right columns-selector options))) (defn inner-join + "Applies the inner-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (inner-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (inner-join ds-left ds-right columns-selector nil)) @@ -84,6 +92,10 @@ (apply-join j/inner-join ds-left ds-right columns-selector options))) (defn asof-join + "Applies the asof-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (asof-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (asof-join ds-left ds-right columns-selector nil)) @@ -98,7 +110,10 @@ (j/pd-merge ds-left ds-right (assoc options :left-on left :right-on right :how :outer)))) (defn full-join - "Join keeping all rows" + "Join keeping all rows. If no columns-selector is + provided, common columns between two datasets are used as column-selectors.
Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (full-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (full-join ds-left ds-right columns-selector nil)) @@ -114,6 +129,10 @@ (distinct))) (defn semi-join + "Applies the semi-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (semi-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (semi-join ds-left ds-right columns-selector nil)) @@ -122,6 +141,10 @@ (select-rows ds-left)))) (defn anti-join + "Applies the anti-join operation on the datasets. If no columns-selector is + provided, common columns between two datasets are used as column-selectors. Options is a map with following keys - + - `hashing` - Hashing function to use (default identity) + - `drop-join-column?` - Remove joined columns (default true)" ([ds-left ds-right] (anti-join ds-left ds-right (automatic-columns-selector ds-left ds-right))) ([ds-left ds-right columns-selector] (anti-join ds-left ds-right columns-selector nil))