Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatic columns selector for joins function #168

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion src/tablecloth/api.clj
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@


(defn anti-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/anti-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/anti-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down Expand Up @@ -315,6 +317,8 @@


(defn asof-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/asof-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/asof-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down Expand Up @@ -1180,6 +1184,8 @@ column-names function returns names according to columns-selector

(defn full-join
"Join keeping all rows"
([ds-left ds-right]
(tablecloth.api.join-concat-ds/full-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/full-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down Expand Up @@ -1337,6 +1343,8 @@ column-names function returns names according to columns-selector


(defn inner-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/inner-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/inner-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand All @@ -1351,7 +1359,7 @@ column-names function returns names according to columns-selector


(defn join-columns
"Join clumns of dataset. Accepts:
"Join columns of dataset. Accepts:
dataset
column selector (as in select-columns)
options
Expand Down Expand Up @@ -1393,6 +1401,8 @@ column-names function returns names according to columns-selector


(defn left-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/left-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/left-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down Expand Up @@ -2078,6 +2088,8 @@ column-names function returns names according to columns-selector


(defn right-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/right-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/right-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down Expand Up @@ -2187,6 +2199,8 @@ column-names function returns names according to columns-selector


(defn semi-join
([ds-left ds-right]
(tablecloth.api.join-concat-ds/semi-join ds-left ds-right))
([ds-left ds-right columns-selector]
(tablecloth.api.join-concat-ds/semi-join ds-left ds-right columns-selector))
([ds-left ds-right columns-selector options]
Expand Down
48 changes: 47 additions & 1 deletion src/tablecloth/api/join_concat_ds.clj
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,51 @@
(impl [(first cols-left) (first cols-right)] ds-left ds-right (or options {}))
(multi-join impl ds-left ds-right cols-left cols-right options))))

(defn- automatic-columns-selector
  "Returns a vector of the column names that occur in both `ds-left` and
  `ds-right`. The join functions use it as the default columns-selector when
  the caller does not supply one.

  Names are returned in the order in which they appear in `ds-left`, so the
  result does not depend on hash-set iteration order. Returns an empty vector
  when the two datasets share no column names."
  [ds-left ds-right]
  (let [right-names (set (column-names ds-right))]
    ;; `contains?` is used instead of invoking the set as a function so that
    ;; falsy column names (e.g. `false`) are not silently dropped.
    ;; `distinct` keeps the set-like "each name at most once" guarantee.
    (into []
          (comp (filter #(contains? right-names %))
                (distinct))
          (column-names ds-left))))

(defn left-join
  "Left join of `ds-left` with `ds-right`.

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (apply-join j/left-join
               ds-left
               ds-right
               (automatic-columns-selector ds-left ds-right)
               nil))
  ([ds-left ds-right columns-selector]
   (apply-join j/left-join ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (apply-join j/left-join ds-left ds-right columns-selector options)))

(defn right-join
  "Right join of `ds-left` with `ds-right`.

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (apply-join j/right-join
               ds-left
               ds-right
               (automatic-columns-selector ds-left ds-right)
               nil))
  ([ds-left ds-right columns-selector]
   (apply-join j/right-join ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (apply-join j/right-join ds-left ds-right columns-selector options)))

(defn inner-join
  "Inner join of `ds-left` with `ds-right`.

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (apply-join j/inner-join
               ds-left
               ds-right
               (automatic-columns-selector ds-left ds-right)
               nil))
  ([ds-left ds-right columns-selector]
   (apply-join j/inner-join ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (apply-join j/inner-join ds-left ds-right columns-selector options)))

(defn asof-join
  "As-of join of `ds-left` with `ds-right` (delegates to `j/left-join-asof`).

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (apply-join j/left-join-asof
               ds-left
               ds-right
               (automatic-columns-selector ds-left ds-right)
               nil))
  ([ds-left ds-right columns-selector]
   (apply-join j/left-join-asof ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (apply-join j/left-join-asof ds-left ds-right columns-selector options)))
Expand All @@ -81,7 +110,12 @@
(j/pd-merge ds-left ds-right (assoc options :left-on left :right-on right :how :outer))))

(defn full-join
  "Join keeping all rows from both `ds-left` and `ds-right`.

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (full-join ds-left ds-right (automatic-columns-selector ds-left ds-right)))
  ([ds-left ds-right columns-selector]
   (full-join ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (apply-join full-join-wrapper ds-left ds-right columns-selector options)))
Expand All @@ -95,12 +129,24 @@
(distinct)))

(defn semi-join
  "Semi join: returns the rows of `ds-left` at the indexes computed by
  `semi-anti-join-indexes` for `ds-left` and `ds-right`.

  When `columns-selector` is omitted, the columns whose names occur in both
  datasets are used as the join columns. `options` is a map with the
  following keys:

  - `hashing` - Hashing function to use (default identity)
  - `drop-join-column?` - Remove joined columns (default true)"
  ([ds-left ds-right]
   ;; No selector given: join on the column names shared by both datasets.
   (semi-join ds-left ds-right (automatic-columns-selector ds-left ds-right) nil))
  ([ds-left ds-right columns-selector]
   (semi-join ds-left ds-right columns-selector nil))
  ([ds-left ds-right columns-selector options]
   (let [row-indexes (semi-anti-join-indexes ds-left ds-right columns-selector options)]
     (select-rows ds-left row-indexes))))

(defn anti-join
"Applies the anti-join operation on the datasets. If no automatic selector is
provided, common columns between two datasets are used as column-selectors. Options is a map with following keys -
- `hashing` - Hashing function to use (default identity)
- `drop-join-column?` - Remove joined columns (default true)"
([ds-left ds-right]
(anti-join ds-left ds-right (automatic-columns-selector ds-left ds-right)))
([ds-left ds-right columns-selector] (anti-join ds-left ds-right columns-selector nil))
([ds-left ds-right columns-selector options]
(->> (semi-anti-join-indexes ds-left ds-right columns-selector options)
Expand Down
2 changes: 1 addition & 1 deletion src/tablecloth/api/join_separate.clj
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
(if drop-columns? (drop-columns result col-names) result)))

(defn join-columns
"Join clumns of dataset. Accepts:
"Join columns of dataset. Accepts:
dataset
column selector (as in select-columns)
options
Expand Down
52 changes: 52 additions & 0 deletions test/tablecloth/api/join_concat_ds_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,22 @@
[:i :y])
(api/rows :as-maps)) => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}])

;; No columns-selector is passed: both datasets share :i and :y, so those
;; columns are used for the join.
(fact "int-string join with automatic column selector"
      (api/rows (api/left-join (api/dataset [{:i "foo" :y 2022}])
                               (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                             {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}]

      ;; Same join, but the left :y column is converted to :int16 first.
      (api/rows (api/left-join (api/convert-types (api/dataset [{:i "foo" :y 2022}])
                                                  {:y :int16})
                               (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                             {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}])

;; The two datasets have no column name in common, so the automatic
;; selector is empty.
(fact "int-string join with automatic column selector - when there are no common columns"
      (api/rows (api/left-join (api/dataset [{:i "foo" :x 2022}])
                               (api/dataset [{:y 2022 :z "bar"}]))
                :as-maps)
      => [{:i "foo", :x 2022, :y 2022 :z "bar"}])

(fact "left join on shorts packed into the vector"
(-> (api/left-join (-> (api/dataset [{:iy ["foo" (short 2022)]}]))
(-> (api/dataset [{:iy ["foo" (long 2022)] :s "2022"}
Expand Down Expand Up @@ -111,3 +127,39 @@
(api/dataset [{:k "baz"}])
[:k])
(api/rows :as-maps)) => [{:k "baz", :v "\"baz\""} {:k "baz", :v "\"baz\""}])

;; Shared columns :i and :y are picked automatically; the unmatched right row
;; appears with nil left-side values.
(fact "right join with automatic column selector"
      (api/rows (api/right-join (api/dataset [{:i "foo" :y 2022}])
                                (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                              {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}
          {:i nil, :y nil, :right.i "foo", :right.y 2023, :s "2023"}])

;; Shared columns :i and :y are picked automatically; only the matching row
;; is expected.
(fact "inner join with automatic column selector"
      (api/rows (api/inner-join (api/dataset [{:i "foo" :y 2022}])
                                (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                              {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}])

;; Shared columns :i and :y are picked automatically; unmatched rows from
;; either side are expected with nils on the other side.
(fact "full join with automatic column selector"
      (api/rows (api/full-join (api/dataset [{:i "foo" :y 2022}
                                             {:i "bar" :y 2021}])
                               (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                             {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022, :right.i "foo", :right.y 2022, :s "2022"}
          {:i "bar", :y 2021, :right.i nil, :right.y nil, :s nil}
          {:i nil, :y nil, :right.i "foo", :right.y 2023, :s "2023"}])

;; Shared columns :i and :y are picked automatically; only the left row with
;; no match on the right is expected.
(fact "anti join with automatic column selector"
      (api/rows (api/anti-join (api/dataset [{:i "foo" :y 2022}
                                             {:i "bar" :y 2021}])
                               (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                             {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "bar", :y 2021}])

;; Shared columns :i and :y are picked automatically; only the left row that
;; has a match on the right is expected.
(fact "semi join with automatic column selector"
      (api/rows (api/semi-join (api/dataset [{:i "foo" :y 2022}
                                             {:i "bar" :y 2021}])
                               (api/dataset [{:i "foo" :y 2022 :s "2022"}
                                             {:i "foo" :y 2023 :s "2023"}]))
                :as-maps)
      => [{:i "foo", :y 2022}])