-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add
migr8
program from previous repository [1]
Also, add a testcontainers wrapper. [1] https://github.com/crate/mongodb-cratedb-migration-tool
- Loading branch information
Showing
20 changed files
with
1,218 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
--- | ||
name: Tests | ||
name: "Tests: Common" | ||
|
||
on: | ||
pull_request: ~ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
--- | ||
name: "Tests: MongoDB" | ||
|
||
on: | ||
pull_request: | ||
branches: ~ | ||
paths: | ||
- '.github/workflows/mongodb.yml' | ||
- 'cratedb_toolkit/io/mongodb/**' | ||
- 'pyproject.toml' | ||
push: | ||
branches: [ main ] | ||
paths: | ||
- '.github/workflows/mongodb.yml' | ||
- 'cratedb_toolkit/io/mongodb/**' | ||
- 'pyproject.toml' | ||
|
||
# Allow job to be triggered manually. | ||
workflow_dispatch: | ||
|
||
# Run job each night after CrateDB nightly has been published. | ||
schedule: | ||
- cron: '0 3 * * *' | ||
|
||
# Cancel in-progress jobs when pushing to the same branch. | ||
concurrency: | ||
cancel-in-progress: true | ||
group: ${{ github.workflow }}-${{ github.ref }} | ||
|
||
jobs: | ||
|
||
tests: | ||
|
||
runs-on: ${{ matrix.os }} | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
os: ["ubuntu-latest"] | ||
python-version: ["3.8", "3.12"] | ||
mongodb-version: ["2", "3", "4", "5", "6", "7"] | ||
|
||
env: | ||
OS: ${{ matrix.os }} | ||
PYTHON: ${{ matrix.python-version }} | ||
MONGODB_VERSION: ${{ matrix.mongodb-version }} | ||
# Do not tear down Testcontainers | ||
TC_KEEPALIVE: true | ||
|
||
name: Python ${{ matrix.python-version }}, MongoDB ${{ matrix.mongodb-version }} on OS ${{ matrix.os }} | ||
steps: | ||
|
||
- name: Acquire sources | ||
uses: actions/checkout@v4 | ||
|
||
- name: Set up Python | ||
uses: actions/setup-python@v4 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
architecture: x64 | ||
cache: 'pip' | ||
cache-dependency-path: 'pyproject.toml' | ||
|
||
- name: Setup project | ||
run: | | ||
# `setuptools 64.0.0` adds support for editable install hooks (PEP 660). | ||
# https://github.com/pypa/setuptools/blob/main/CHANGES.rst#v6400 | ||
pip install "setuptools>=64" --upgrade | ||
# Install package in editable mode. | ||
pip install --use-pep517 --prefer-binary --editable=.[io,test,develop] | ||
- name: Run linter and software tests | ||
run: | | ||
poe check | ||
- name: Upload coverage to Codecov | ||
uses: codecov/codecov-action@v3 | ||
with: | ||
files: ./coverage.xml | ||
flags: mongodb | ||
env_vars: OS,PYTHON | ||
name: codecov-umbrella | ||
fail_ci_if_error: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Prefer the standard library metadata API, and fall back to the
# `importlib_metadata` package when it cannot be imported.
# `ModuleNotFoundError` is a subclass of `ImportError`, so catching
# the latter covers both.
try:
    from importlib.metadata import PackageNotFoundError, version
except ImportError:  # pragma:nocover
    from importlib_metadata import PackageNotFoundError, version  # type: ignore[assignment,no-redef,unused-ignore]

# Distribution name used to look up the installed version.
__appname__ = "cratedb-toolkit"

# Use a placeholder when no metadata for the distribution is installed.
try:
    __version__ = version(__appname__)
except PackageNotFoundError:  # pragma: no cover
    __version__ = "unknown"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# MongoDB → CrateDB Migration Tool | ||
|
||
A utility program, called `migr8`, supporting data migrations | ||
between MongoDB and CrateDB. | ||
|
||
|
||
## About | ||
|
||
### Details | ||
|
||
This tool iterates over one or multiple MongoDB collections, | ||
and iteratively builds up a description of the schema of those | ||
collections. | ||
|
||
In a second step, this description can be used to create a CrateDB table | ||
schema, which will attempt to determine a best-fit table definition for | ||
that schema. | ||
|
||
As such, this means the tool works best on collections of similarly | ||
structured and typed data. | ||
|
||
### Supported MongoDB versions | ||
|
||
The application supports the following versions of MongoDB. | ||
|
||
[![Supported MongoDB versions](https://img.shields.io/badge/MongoDB-2.x%20--%207.x-blue.svg)](https://github.com/mongodb/mongo) | ||
|
||
If you need support for MongoDB 2.x, you will need to downgrade the `pymongo` | ||
client driver library to version 3, like `pip install 'pymongo<4'`. | ||
|
||
### Installation | ||
|
||
Use `pip` to install the package from PyPI. | ||
```shell | ||
pip install --upgrade 'cratedb-toolkit[io]' | ||
``` | ||
|
||
To verify if the installation worked, invoke: | ||
```shell | ||
migr8 --version | ||
migr8 --help | ||
``` | ||
|
||
|
||
## Usage | ||
|
||
The program `migr8` offers three subcommands (`extract`, `translate`, `export`) | ||
to conduct data transfers from MongoDB to CrateDB. Please read this section | ||
carefully to learn how to use them successfully. | ||
|
||
### Schema Extraction | ||
|
||
To extract a description of the schema of a collection, use the | ||
`extract` subcommand. For example: | ||
|
||
migr8 extract --host localhost --port 27017 --database test_db | ||
|
||
After connecting to the designated MongoDB server, it will | ||
look at the collections within that database, and will prompt you which | ||
collections to *exclude* from analysis. | ||
|
||
You can then do a *full* or *partial* scan of the collection. | ||
|
||
A partial scan will only look at the first entry in a collection, and | ||
thus may produce an ambiguous schema definition. It is still useful if you | ||
already know the collection is systematically and regularly structured. | ||
|
||
A full scan will iterate over the entire collection and build up the | ||
schema description. Cancelling the scan will cause the tool to output | ||
the schema description it has built up thus far. | ||
|
||
For example, scanning a collection of payloads including a `ts` field, | ||
a `sensor` field, and a `payload` object, may yield this outcome: | ||
|
||
```json | ||
{ | ||
"test": { | ||
"count": 100000, | ||
"document": { | ||
"_id": { | ||
"count": 100000, | ||
"types": { | ||
"OID": { | ||
"count": 100000 | ||
} | ||
} | ||
}, | ||
"ts": { | ||
"count": 100000, | ||
"types": { | ||
"DATETIME": { | ||
"count": 100000 | ||
} | ||
} | ||
}, | ||
"sensor": { | ||
"count": 100000, | ||
"types": { | ||
"STRING": { | ||
"count": 100000 | ||
} | ||
} | ||
}, | ||
"payload": { | ||
"count": 100000, | ||
"types": { | ||
"OBJECT": { | ||
"count": 100000, | ||
"document": { | ||
"temp": { | ||
"count": 100000, | ||
"types": { | ||
"FLOAT": { | ||
"count": 1 | ||
}, | ||
"INTEGER": { | ||
"count": 99999 | ||
} | ||
} | ||
}, | ||
"humidity": { | ||
"count": 100000, | ||
"types": { | ||
"FLOAT": { | ||
"count": 1 | ||
}, | ||
"INTEGER": { | ||
"count": 99999 | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
``` | ||
|
||
This description indicates that the data is well-structured, and has | ||
mostly consistent data-types. | ||
|
||
|
||
### Schema Translation | ||
|
||
Once a schema description has been extracted, it can be translated | ||
into a CrateDB schema definition using the `translate` subcommand: | ||
|
||
migr8 translate -i mongodb_schema.json | ||
|
||
This will attempt to translate the description into a best-fit CrateDB | ||
table definition. Where datatypes are ambiguous, it will *choose the | ||
most common datatype*. For example, the previous schema definition would | ||
be translated into this SQL DDL statement: | ||
```sql | ||
CREATE TABLE IF NOT EXISTS "doc"."test" ( | ||
"ts" TIMESTAMP WITH TIME ZONE, | ||
"sensor" TEXT, | ||
"payload" OBJECT (STRICT) AS ( | ||
-- ⬇️ Types: FLOAT: 0.0%, INTEGER: 100.0% | ||
"temp" INTEGER, | ||
-- ⬇️ Types: FLOAT: 0.0%, INTEGER: 100.0% | ||
"humidity" INTEGER | ||
) | ||
); | ||
``` | ||
|
||
|
||
### MongoDB Collection Export | ||
|
||
To export a MongoDB collection to a JSON stream, use the `export` | ||
subcommand: | ||
|
||
migr8 export --host localhost --port 27017 --database test_db --collection test | ||
|
||
This will convert the collection's records into JSON, and output the JSON to stdout. | ||
For example, to redirect the output to a file, run: | ||
|
||
migr8 export --host localhost --port 27017 --database test_db --collection test > test.json | ||
|
||
Alternatively, use [cr8] to directly write the MongoDB collection into a CrateDB table: | ||
|
||
migr8 export --host localhost --port 27017 --database test_db --collection test | \ | ||
cr8 insert-json --hosts localhost:4200 --table test | ||
|
||
|
||
[cr8]: https://github.com/mfussenegger/cr8 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import argparse | ||
import json | ||
|
||
import pymongo | ||
import rich | ||
from bson.raw_bson import RawBSONDocument | ||
|
||
from cratedb_toolkit import __version__ | ||
from cratedb_toolkit.io.mongodb.core import extract, translate | ||
|
||
from .export import export | ||
|
||
|
||
def extract_parser(subargs):
    """
    Register the `extract` subcommand and its options.

    :param subargs: The subparsers action, as returned by
                    `ArgumentParser.add_subparsers()`.
    """
    parser = subargs.add_parser("extract", help="Extract a schema from a MongoDB database")
    parser.add_argument("--host", default="localhost", help="MongoDB host")
    # Convert at the parsing boundary, so a port given on the command line has
    # the same type as the integer default, and invalid input is rejected by
    # argparse with a usage error.
    parser.add_argument("--port", type=int, default=27017, help="MongoDB port")
    parser.add_argument("--database", required=True, help="MongoDB database")
    parser.add_argument("--collection", help="MongoDB collection to create a schema for")
    parser.add_argument(
        "--scan",
        choices=["full", "partial"],
        help="Whether to fully scan the MongoDB collections or only partially.",
    )
    parser.add_argument("-o", "--out", default="mongodb_schema.json")
|
||
|
||
def translate_parser(subargs):
    """
    Register the `translate` subcommand and its `-i/--infile` option.

    :param subargs: The subparsers action, as returned by
                    `ArgumentParser.add_subparsers()`.
    """
    summary = "Translate a MongoDB schema definition to a CrateDB table schema"
    subcommand = subargs.add_parser("translate", help=summary)
    subcommand.add_argument(
        "-i",
        "--infile",
        help="The JSON file to read the MongoDB schema from",
    )
|
||
|
||
def export_parser(subargs):
    """
    Register the `export` subcommand and its options.

    :param subargs: The subparsers action, as returned by
                    `ArgumentParser.add_subparsers()`.
    """
    parser = subargs.add_parser("export", help="Export a MongoDB collection as plain JSON")
    parser.add_argument("--collection", required=True)
    parser.add_argument("--host", default="localhost", help="MongoDB host")
    # Convert at the parsing boundary, so a port given on the command line has
    # the same type as the integer default, and invalid input is rejected by
    # argparse with a usage error.
    parser.add_argument("--port", type=int, default=27017, help="MongoDB port")
    parser.add_argument("--database", required=True, help="MongoDB database")
|
||
|
||
def get_args():
    """
    Build the command-line parser and parse the process arguments.

    Registers the `-V/--version` option and the `extract`, `translate`,
    and `export` subcommands. The selected subcommand name is stored in
    the `command` attribute of the result.

    :return: The namespace produced by `ArgumentParser.parse_args()`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        # The previous help text referred to an unrelated program name.
        help="print the package version and exit",
        version=f"%(prog)s ({__version__})",
    )
    subparsers = parser.add_subparsers(dest="command")
    extract_parser(subparsers)
    translate_parser(subparsers)
    export_parser(subparsers)
    return parser.parse_args()
|
||
|
||
def extract_to_file(args):
    """
    Run the schema extraction and store its result as JSON.

    The result of `extract(args)` is serialized, indented by four spaces,
    into the file designated by `args.out`. Progress messages are printed
    before and after writing.
    """
    description = extract(args)

    rich.print(f"\nWriting resulting schema to {args.out}...")
    with open(args.out, "w") as handle:
        json.dump(description, handle, indent=4)

    rich.print("[green bold]Done![/green bold]")
|
||
|
||
def translate_from_file(args):
    """
    Load a schema description from the JSON file at `args.infile`,
    and pass it to `translate`.
    """
    with open(args.infile) as handle:
        description = json.loads(handle.read())
    translate(description)
|
||
|
||
def export_to_stdout(args):
    """
    Connect to MongoDB and pass the selected collection to `export`.

    The client is created from `args.host` and `args.port`, using
    `RawBSONDocument` as document class. The collection is addressed
    by `args.database` and `args.collection`.
    """
    port = int(args.port)
    connection = pymongo.MongoClient(args.host, port, document_class=RawBSONDocument)
    collection = connection[args.database][args.collection]
    export(collection)
|
||
|
||
def main():
    """
    Program entry point: parse the command line, and invoke the handler
    function for the selected subcommand.

    When no known subcommand has been selected, nothing is invoked.
    """
    args = get_args()

    # Map subcommand names to their handler functions.
    handlers = {
        "extract": extract_to_file,
        "translate": translate_from_file,
        "export": export_to_stdout,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
Oops, something went wrong.