Skip to content

Commit

Permalink
feat(ingest/cassandra): Add support for Cassandra as a source (#11822)
Browse files Browse the repository at this point in the history
  • Loading branch information
sagar-salvi-apptware authored Nov 15, 2024
1 parent becda8f commit fd2da83
Show file tree
Hide file tree
Showing 23 changed files with 6,327 additions and 96 deletions.
4 changes: 4 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import csvLogo from '../../../../images/csv-logo.png';
import qlikLogo from '../../../../images/qliklogo.png';
import sigmaLogo from '../../../../images/sigmalogo.png';
import sacLogo from '../../../../images/saclogo.svg';
import cassandraLogo from '../../../../images/cassandralogo.png';
import datahubLogo from '../../../../images/datahublogo.png';

export const ATHENA = 'athena';
Expand Down Expand Up @@ -129,6 +130,8 @@ export const SIGMA = 'sigma';
export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`;
export const SAC = 'sac';
export const SAC_URN = `urn:li:dataPlatform:${SAC}`;
export const CASSANDRA = 'cassandra';
export const CASSANDRA_URN = `urn:li:dataPlatform:${CASSANDRA}`;
export const DATAHUB = 'datahub';
export const DATAHUB_GC = 'datahub-gc';
export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file';
Expand Down Expand Up @@ -175,6 +178,7 @@ export const PLATFORM_URN_TO_LOGO = {
[QLIK_SENSE_URN]: qlikLogo,
[SIGMA_URN]: sigmaLogo,
[SAC_URN]: sacLogo,
[CASSANDRA_URN]: cassandraLogo,
[DATAHUB_URN]: datahubLogo,
};

Expand Down
7 changes: 7 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
Original file line number Diff line number Diff line change
Expand Up @@ -310,5 +310,12 @@
"description": "Import Spaces, Sources, Tables and statistics from Dremio.",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: dremio\n config:\n # Coordinates\n hostname: null\n port: null\n #true if https, otherwise false\n tls: true\n\n #For cloud instance\n #is_dremio_cloud: True\n #dremio_cloud_project_id: <project_id>\n\n #Credentials with personal access token\n authentication_method: PAT\n password: pass\n\n #Or Credentials with basic auth\n #authentication_method: password\n #username: null\n #password: null\n\n stateful_ingestion:\n enabled: true"
},
{
"urn": "urn:li:dataPlatform:cassandra",
"name": "cassandra",
"displayName": "CassandraDB",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
}
]
Binary file added datahub-web-react/src/images/cassandralogo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
40 changes: 40 additions & 0 deletions metadata-ingestion/docs/sources/cassandra/cassandra_pre.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
### Setup

This integration pulls metadata directly from Cassandra databases, including both **DataStax Astra DB** and **Cassandra Enterprise Edition (EE)**.

You’ll need to have a Cassandra instance or an Astra DB setup with appropriate access permissions.

#### Steps to Get the Required Information

1. **Set Up User Credentials**:

- **For Astra DB**:
- Log in to your Astra DB Console.
- Navigate to **Organization Settings** > **Token Management**.
- Generate an **Application Token** with the required permissions for read access.
- Download the **Secure Connect Bundle** from the Astra DB Console.
- **For Cassandra EE**:
- Ensure you have a **username** and **password** with read access to the necessary keyspaces.

2. **Permissions**:

- The user or token must have `SELECT` permissions that allow it to:
- Access metadata in system keyspaces (e.g., `system_schema`) to retrieve information about keyspaces, tables, columns, and views.
- Perform `SELECT` operations on the data tables if data profiling is enabled.

3. **Verify Database Access**:
- For Astra DB: Ensure the **Secure Connect Bundle** is used and configured correctly.
- For Cassandra Opensource: Ensure the **contact point** and **port** are accessible.


:::caution

When enabling profiling, make sure to set a limit on the number of rows to sample. Profiling large tables without a limit may lead to excessive resource consumption and slow performance.

:::

:::note

For cloud configuration with Astra DB, it is necessary to specify the Secure Connect Bundle path in the configuration. For that reason, use the CLI to ingest metadata into DataHub.

:::
30 changes: 30 additions & 0 deletions metadata-ingestion/docs/sources/cassandra/cassandra_recipe.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
source:
type: "cassandra"
config:
# Credentials for on prem cassandra
contact_point: "localhost"
port: 9042
username: "admin"
password: "password"

# Or
# Credentials Astra Cloud
#cloud_config:
# secure_connect_bundle: "Path to Secure Connect Bundle (.zip)"
# token: "Application Token"

# Optional Allow / Deny extraction of particular keyspaces.
keyspace_pattern:
allow: [".*"]

# Optional Allow / Deny extraction of particular tables.
table_pattern:
allow: [".*"]

# Optional
profiling:
enabled: true
profile_table_level_only: true

sink:
# config sinks
9 changes: 9 additions & 0 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,13 @@
# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0
# https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
"elasticsearch": {"elasticsearch==7.13.4"},
"cassandra": {
"cassandra-driver>=3.28.0",
# We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
# with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
"numpy<2",
},
"feast": {
"feast>=0.34.0,<1",
"flask-openid>=1.3.0",
Expand Down Expand Up @@ -660,6 +667,7 @@
"qlik-sense",
"sigma",
"sac",
"cassandra",
]
if plugin
for dependency in plugins[plugin]
Expand Down Expand Up @@ -778,6 +786,7 @@
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
"sigma = datahub.ingestion.source.sigma.sigma:SigmaSource",
"sac = datahub.ingestion.source.sac.sac:SACSource",
"cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource",
],
"datahub.ingestion.transformer.plugins": [
"pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",
Expand Down
Empty file.
Loading

0 comments on commit fd2da83

Please sign in to comment.