Feature/cft 3210 docu #10

Open · wants to merge 2 commits into main

186 changes: 136 additions & 50 deletions — README.md
The query output is exported using the name of the input table.
In simple mode, the parameter **queries** is an array of queries to be executed. Each query has its own input, query, and output, and is isolated from the others.
Each query can use the following parameters:

- **input:** A string specifying the name of the table, or an object containing the following parameters:
  - **input_pattern** (required): The name of the table or a [glob pattern](https://duckdb.org/docs/data/multiple_files/overview#glob-syntax)
  - **duckdb_destination:** The name of the table in DuckDB (required when a glob pattern is used in **input_pattern**)
  - **dtypes_mode:** all_varchar (default) / auto_detect / from_manifest
    - all_varchar: Treats all columns as text
    - auto_detect: Automatically infers data types
    - from_manifest: Uses the data types defined in the table manifest
  - **skip_lines:** The number of lines to skip at the beginning of the file
  - **delimiter:** A string specifying the column delimiter
  - **quotechar:** A string specifying the quote character
  - **column_names:** A list of column names
  - **date_format:** A string specifying the date format
  - **timestamp_format:** A string specifying the timestamp format
  - **add_filename_column:** A boolean value indicating whether the filename should be added as a column
- **query** (required): The query to be executed
- **output:** A string specifying the output table name, or an object containing the following parameters:
  - **kbc_destination:** The name of the output table
  - **primary_key:** A list of columns forming the primary key
  - **incremental:** A boolean enabling incremental loading

#### Example configuration

```
{
"queries":[
{
"input": "sales.csv",
"query" : "SELECT sales_representative, SUM(turnover) AS total_turnover FROM sales GROUP BY sales_representative;",
}]
"before": [],
"after": [
{
"definition": {
"component": "keboola.processor-duckdb"
},
"parameters": {
"mode": "simple",
"queries": [
{
"input": "sales.csv",
"query": "SELECT sales_representative, SUM(turnover) AS total_turnover FROM sales.csv GROUP BY sales_representative"
}
]
}
}
]
}
```

```
{
"mode": "simple",
"queries":[
{
"input": {
"input_pattern": "/data/in/files/*.csv",
"duckdb_destination": "sales",
"dtypes_mode": "auto_detect",
"skip_lines": 1,
"delimiter": ",",
"quotechar": "\"",
"column_names": ["id", "name", "turnover", "sales_representative", "date"],
"date_format": "YYYY-MM-DD",
"timestamp_format": "YYYY-MM-DD HH:MM:SS"
},
"query" : "SELECT *, 'csv' as new_column FROM products AS p LEFT JOIN '/data/in/files/categories.parquet' AS cat on p.category = cat.id ORDER BY p.id",
"output": {
"kbc_destination": "out-sales",
"primary_key": ["id"],
"incremental": true
"before": [],
"after": [
{
"definition": {
"component": "keboola.processor-duckdb"
},
"parameters": {
"mode": "simple",
"queries":[
{
"input": {
"input_pattern": "/data/in/tables/*.csv",
"duckdb_destination": "products",
"dtypes_mode": "auto_detect",
"skip_lines": 1,
"delimiter": ",",
"quotechar": "\"",
"column_names": ["id", "name", "category_id"],
"date_format": "YYYY-MM-DD",
"timestamp_format": "YYYY-MM-DD HH:MM:SS",
"add_filename_column": true
},
"query" : "SELECT p.*, cat.name as category_name FROM products AS p LEFT JOIN '/data/in/files/categories.parquet' AS cat on p.category_id = cat.id ORDER BY p.id",
"output": {
"kbc_destination": "out-products",
"primary_key": ["id"],
"incremental": true
}
}]
}
}
]
}
```

In advanced mode, [relations](https://duckdb.org/docs/api/python/relational_api) are created for all input tables.
Then, all defined queries are processed. Finally, the tables listed in **output** are exported to Keboola Storage.
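
These phases map directly onto the keys of the processor's **parameters** object. A skeleton only, with values elided for brevity:

```
{
  "mode": "advanced",
  "input": ["..."],
  "queries": ["..."],
  "output": ["..."]
}
```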

Parameters:
- **input:** An array of all input tables from Keboola, each defined either as a string containing the table name, or as an object containing the following parameters:
  - **input_pattern** (required): The name of the table or a [glob pattern](https://duckdb.org/docs/data/multiple_files/overview#glob-syntax)
  - **duckdb_destination:** The name of the table in DuckDB (required when a glob pattern is used in **input_pattern**)
  - **dtypes_mode:** all_varchar (default) / auto_detect / from_manifest
    - all_varchar: Treats all columns as text
    - auto_detect: Automatically infers data types
    - from_manifest: Uses the data types defined in the table manifest
  - **skip_lines:** The number of lines to skip at the beginning of the file
  - **delimiter:** A string specifying the column delimiter
  - **quotechar:** A string specifying the quote character
  - **column_names:** A list of column names
  - **date_format:** A string specifying the date format
  - **timestamp_format:** A string specifying the timestamp format
  - **add_filename_column:** A boolean value indicating whether the filename should be added as a column
- **queries:** A list of SQL queries to be executed
- **output:** An array of all output tables, each defined as a string specifying the output table name, or as an object containing the following parameters:
  - **duckdb_source:** The name of the DuckDB table or view to export
  - **kbc_destination:** The name of the output table
  - **primary_key:** A list of columns forming the primary key
  - **incremental:** A boolean enabling incremental loading

#### Example configuration


Example configuration loading two tables, running queries (including one that stores a Parquet file in the /data/out/files folder), and finally exporting a table to Keboola Storage.
```
{
"mode": "advanced",
"input": ["sliced", "days.csv"],
"queries":["CREATE view final AS SELECT * FROM sliced LEFT JOIN days.csv USING (id) ORDER BY id"],
"output": [
{"duckdb_source": "final",
"kbc_destination": "final.csv"
}
]
"before": [],
"after": [
{
"definition": {
"component": "keboola.processor-duckdb"
},
"parameters": {
"mode": "advanced",
"input": [
"sales1.csv",
"sales2.csv"
],
"queries": [
"CREATE view sales_all AS SELECT * FROM sales1.csv UNION ALL SELECT * FROM sales2.csv",
"CREATE view sales_first AS SELECT * FROM sales1.csv LIMIT 1",
"COPY sales_all TO '/data/out/files/sales_all.parquet' (FORMAT PARQUET)"
],
"output": [
"sales_first"
]
}
}
]
}
```

This example demonstrates the full configuration for loading data from storage using a glob pattern. In the queries, we are joining a table created from multiple files with a Parquet file from storage. For the output, we provide a detailed configuration to define the name, primary key (PK), and incremental load.
```
{
"mode": "advanced",
"input": ["products"],
"queries":["CREATE view out AS SELECT * FROM products AS p LEFT JOIN '/data/in/files/categories.parquet' AS cat on p.category = cat.id ORDER BY p.id;",
"CREATE view out2 AS SELECT * FROM products WHERE discount = TRUE;"],
"output": ["out", {"source":"out2", "destination":"out.bucket.out2.csv", "primary_key": ["id"], "incremental": true}],
"before": [],
"after": [
{
"definition": {
"component": "keboola.processor-duckdb"
},
"parameters": {
"mode": "advanced",
"input": [{
"input_pattern": "/data/in/tables/*.csv",
"duckdb_destination": "products",
"dtypes_mode": "auto_detect",
"skip_lines": 1,
"delimiter": ",",
"quotechar": "\"",
"column_names": [
"id",
"name",
"category_id"
],
"date_format": "YYYY-MM-DD",
"timestamp_format": "YYYY-MM-DD HH:MM:SS",
"add_filename_column": true
}],
"queries": [
"CREATE TABLE category AS SELECT * FROM '/data/in/files/categories.parquet'",
"CREATE VIEW out AS SELECT p.*, category.name as category_name FROM products AS p LEFT JOIN category on p.category_id = category.id ORDER BY p.id"
],
"output": [
{
"duckdb_source": "out",
"kbc_destination": "out",
"primary_key": [
"id"
],
"incremental": true
},
"category"
]
}
}
]
}
```

Load a file from a URL and save it as a table. You can also use [DuckDB extensions](https://duckdb.org/docs/extensions/overview) to load data from other sources.
Another common use case, where no input tables need to be defined, is when a file extractor has downloaded files (Parquet, JSON, or Excel) and we want to query them and save the result to Keboola Storage as a table; a sketch of that variant follows the URL example below.
```
{
"mode": "advanced",
"queries":["CREATE view cars AS SELECT * FROM 'https://github.com/keboola/developers-docs/raw/3f1e8a4331638a2300b29e63f797a1d52d64929e/integrate/variables/countries.csv'"],
"output": ["cars"]
"before": [],
"after": [
{
"definition": {
"component": "keboola.processor-duckdb"
},
"parameters": {
"mode": "advanced",
"queries":["CREATE view cars AS SELECT * FROM 'https://github.com/keboola/developers-docs/raw/3f1e8a4331638a2300b29e63f797a1d52d64929e/integrate/variables/countries.csv'"],
"output": ["cars"]
}
}
]
}
```
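
For the file-extractor case mentioned above, a configuration along these lines would query the downloaded files directly. This is a sketch: the view name `result` and the glob path under /data/in/files are illustrative, not prescribed by the component.

```
{
  "before": [],
  "after": [
    {
      "definition": {
        "component": "keboola.processor-duckdb"
      },
      "parameters": {
        "mode": "advanced",
        "queries": ["CREATE view result AS SELECT * FROM '/data/in/files/*.parquet'"],
        "output": ["result"]
      }
    }
  ]
}
```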

The pull request also adds test fixtures. A sample categories table:

```
id,name
"1","fruit"
"2","vegetable"
"3","beverages"
"4","chocolate"
"5","beer"
"6","wine"
```

Its manifest carries column metadata, including the base data types:

```
{
  "incremental": false,
  "write_always": false,
  "delimiter": ",",
  "enclosure": "\"",
  "column_metadata": {
    "id": [
      {"key": "KBC.datatype.basetype", "value": "INTEGER"},
      {"key": "KBC.datatype.nullable", "value": false}
    ],
    "name": [
      {"key": "KBC.datatype.basetype", "value": "STRING"},
      {"key": "KBC.datatype.nullable", "value": false}
    ]
  },
  "columns": ["id", "name"]
}
```
The PR also adds a functional-test configuration that reads Parquet files in advanced mode:

```
{
  "parameters": {
    "mode": "advanced",
    "queries": ["CREATE view final AS SELECT * FROM '/code/tests/functional/advanced-in-parquet-out-csv/source/data/in/files/*.parquet'"],
    "output": ["final"]
  },
  "image_parameters": {},
  "action": "run",
  "shared_code_row_ids": [],
  "storage": {},
  "authorization": {}
}
```