Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export / Import #29

Merged
merged 7 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@


- Return RID on upload
- Support KB import/export


## 1.1.12 (2023-09-21)
Expand Down
1 change: 1 addition & 0 deletions docs/01-README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ First steps should be:
- [Upload conversation](07-conversation.md)
- [Search](06-search.md)
- [Extract information from a file](05-extract.md)
- [Import/export knowledge boxes](08-import-export.md)
- Detect Entities
- [Get embedding from text](05-extract.md)
- Get answer from a context
52 changes: 52 additions & 0 deletions docs/08-import-export.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Import/export

## Export a kb

```sh
nuclia kbs my-source-kb default
nuclia kb exports start --path=/some/path/foobar

> Export is ready to be downloaded.
> Downloading export 8be5339818f443f8b6c0afc143d2fe02 to /some/path/foobar: 7.13MB [00:05, 1.36MB/s]
```

## Import it to another kb

```sh
nuclia kbs my-dst-kb default
nuclia kb imports start --path=/some/path/foobar

> Uploading from /some/path/foobar to import: 7.13MB [00:01, 3.77MB/s]
> import_id='317e6816a661450e91ba192afad96b99'
```

The returned `import_id` can be used to check the status of the import:

```sh
nuclia kb imports status --import_id=317e6816a661450e91ba192afad96b99

> status=<Status.FINISHED: 'finished'>
```

Alternatively, you can start the import with `--sync`, so that the command waits for it to finish:

```sh
nuclia kb imports start --path=/some/path/foobar --sync
```

## Using the SDK

```python
from nuclia import sdk

exports = sdk.NucliaExports()
export_id = exports.start().export_id
assert exports.status(export_id=export_id).status == "finished"
exports.download(export_id=export_id, path="/some/path/kb.export")

imports = sdk.NucliaImports()
import_id = imports.start(path="/some/path/kb.export").import_id
assert imports.status(import_id=import_id).status == "finished"

imports.start(path="/some/path/kb.export", sync=True)
```
4 changes: 2 additions & 2 deletions nuclia/sdk/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import requests
from prompt_toolkit import prompt

from nuclia import BASE, get_global_url
from nuclia import BASE, BASE_DOMAIN, get_global_url
from nuclia.cli.utils import yes_no
from nuclia.config import Account, Config, KnowledgeBox, Zone
from nuclia.exceptions import NeedUserToken, UserTokenExpired
Expand Down Expand Up @@ -283,7 +283,7 @@ def kbs(self, account: str):
region = {zone.id: zone.slug for zone in zones}
for kb in kbs:
zone = region[kb["zone"]]
url = f"https://{zone}.nuclia.cloud/api/v1/kb/{kb['id']}"
url = f"https://{zone}.{BASE_DOMAIN}/api/v1/kb/{kb['id']}"
kb_obj = KnowledgeBox(
url=url, id=kb["id"], title=kb["title"], account=account, region=zone
)
Expand Down
113 changes: 113 additions & 0 deletions nuclia/sdk/export_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import time
from functools import partial
from typing import Optional

from nucliadb_models.export_import import (
CreateExportResponse,
CreateImportResponse,
Status,
)
from tqdm import tqdm

from nuclia.decorators import kb
from nuclia.lib.kb import NucliaDBClient

# Number of bytes in one mebibyte.
MB = 1024 * 1024
# Size of each chunk (5 MiB) used when streaming export downloads to disk
# and when reading an export file from disk to upload it as an import.
CHUNK_SIZE = 5 * MB


class NucliaExports:
    """
    Manage Knowledge Box exports.

    Every method is wrapped with ``@kb`` and reads the ``NucliaDBClient`` to
    use from ``kwargs["ndb"]`` (presumably injected by the decorator).
    """

    @kb
    def start(
        self, *args, path: Optional[str] = None, **kwargs
    ) -> Optional[CreateExportResponse]:
        """
        Start an export of the current Knowledge Box.

        :param path: optional destination file. When given, the call waits
            for the export to finish and downloads it to this path.
        :return: the export creation response (carrying ``export_id``) when
            no ``path`` is given; ``None`` when the export was downloaded.
        """
        ndb: NucliaDBClient = kwargs["ndb"]
        resp = ndb.ndb.start_export(kbid=ndb.kbid)
        if path is None:
            return resp
        self.download(export_id=resp.export_id, path=path, **kwargs)
        return None

    @kb
    def download(self, *, export_id: str, path: str, **kwargs) -> None:
        """
        Wait for an export to finish and write it to ``path``.

        :param export_id: identifier of a previously started export.
        :param path: destination file; it is opened in binary write mode, so
            an existing file is overwritten.
        """
        ndb: NucliaDBClient = kwargs["ndb"]
        # The export can only be downloaded once it has finished.
        wait_for_finished(ndb, "export", export_id)
        print("Export is ready to be downloaded.")
        iterator = ndb.ndb.download_export(kbid=ndb.kbid, export_id=export_id)
        with open(path, "wb") as f:
            with tqdm(
                desc=f"Downloading export {export_id} to {path}",
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                miniters=1,
            ) as pbar:
                # Stream chunk by chunk so the export is never held fully
                # in memory; the bar advances by the bytes written.
                for chunk in iterator(chunk_size=CHUNK_SIZE):
                    pbar.update(len(chunk))
                    f.write(chunk)

    @kb
    def status(self, *, export_id: str, **kwargs):
        """
        Return the status response of an export.

        Mirrors ``NucliaImports.status`` so exports started without a
        ``path`` can be polled by their ``export_id``.

        :param export_id: identifier of a previously started export.
        :return: the client's export status response; its ``status``
            attribute holds the ``Status`` value.
        """
        ndb: NucliaDBClient = kwargs["ndb"]
        return ndb.ndb.export_status(kbid=ndb.kbid, export_id=export_id)


class NucliaImports:
    """
    Start Knowledge Box imports and query their status.

    """

    @kb
    def start(
        self, *, path: str, sync: bool = False, **kwargs
    ) -> Optional[CreateImportResponse]:
        """
        Upload the export file at ``path`` as a new import.

        :param path: export file to upload; it is read in binary mode.
        :param sync: when true, block until the import has finished.
        :return: the import creation response (carrying ``import_id``) when
            ``sync`` is false; ``None`` after a synchronous import.
        """
        client: NucliaDBClient = kwargs["ndb"]

        def read_chunks(source: str):
            # Lazily yield the file in CHUNK_SIZE pieces, advancing the
            # progress bar by the size of each piece handed out.
            with tqdm(
                desc=f"Uploading from {source} to import",
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                miniters=1,
            ) as progress, open(source, "rb") as stream:
                # read() returns b"" at end of file, which ends the iteration.
                for piece in iter(partial(stream.read, CHUNK_SIZE), b""):
                    progress.update(len(piece))
                    yield piece

        created = client.ndb.start_import(
            kbid=client.kbid, content=read_chunks(path)
        )
        if sync:
            wait_for_finished(client, "import", created.import_id)
            print("Import finished!")
            return None
        return created

    @kb
    def status(self, *, import_id: str, **kwargs) -> Status:
        """
        Return the client's status response for the import ``import_id``.
        """
        client: NucliaDBClient = kwargs["ndb"]
        return client.ndb.import_status(kbid=client.kbid, import_id=import_id)


def wait_for_finished(ndb: NucliaDBClient, type: str, id: str):
    """
    Block until an export or import reaches ``Status.FINISHED``.

    The status is polled once per second while a progress bar counts the
    polling iterations.

    :param ndb: client wrapper; ``ndb.ndb`` and ``ndb.kbid`` are used to
        query the status.
    :param type: either ``"export"`` or ``"import"``.
    :param id: identifier of the export or import to wait for.
    :raises ValueError: if ``type`` is neither ``"export"`` nor ``"import"``.
    :raises AssertionError: if the task ends up errored or cancelled.
    """
    # NOTE: ``type`` and ``id`` shadow builtins; the names are kept because
    # they are part of the function's existing signature.
    if type not in ("export", "import"):
        raise ValueError(f"Unknown type {type}")
    if type == "export":
        get_status = partial(ndb.ndb.export_status, kbid=ndb.kbid, export_id=id)
    else:
        get_status = partial(ndb.ndb.import_status, kbid=ndb.kbid, import_id=id)
    status = get_status().status
    # The context manager closes the progress bar even when we raise.
    with tqdm(
        unit="it", desc=f"Waiting for {type} {id} to finish", miniters=1, delay=1
    ) as pbar:
        while status != Status.FINISHED:
            # Raised explicitly (instead of ``assert``) so the checks still
            # run under ``python -O``; the exception type is unchanged.
            if status == Status.ERRORED:
                raise AssertionError(f"{type} failed")
            if status == Status.CANCELLED:
                raise AssertionError(f"{type} cancelled")
            pbar.update()
            time.sleep(1)
            # Compare the ``status`` field, not the whole response object;
            # otherwise the loop could never observe a terminal state.
            status = get_status().status
5 changes: 4 additions & 1 deletion nuclia/sdk/kb.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from typing import List, Optional
from warnings import warn

from nucliadb_models.configuration import KBConfiguration
from nucliadb_models.labels import KnowledgeBoxLabels, Label, LabelSet, LabelSetKind
from nucliadb_models.resource import Resource, ResourceList

from nuclia.data import get_auth
from nuclia.decorators import kb, pretty
from nuclia.lib.kb import NucliaDBClient
from nuclia.sdk.auth import NucliaAuth
from nuclia.sdk.export_import import NucliaExports, NucliaImports
from nuclia.sdk.logger import logger
from nuclia.sdk.resource import NucliaResource
from nuclia.sdk.search import NucliaSearch
from nuclia.sdk.upload import NucliaUpload
from nucliadb_models.configuration import KBConfiguration


class NucliaKB:
Expand All @@ -25,6 +26,8 @@ def __init__(self):
self.upload = NucliaUpload()
self.search = NucliaSearch()
self.resource = NucliaResource()
self.exports = NucliaExports()
self.imports = NucliaImports()
lferran marked this conversation as resolved.
Show resolved Hide resolved

@kb
def list(self, *, interactive: bool = True, **kwargs) -> Optional[ResourceList]:
Expand Down
4 changes: 2 additions & 2 deletions nuclia/sdk/upload.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from __future__ import annotations

import hashlib
import mimetypes
import os
import hashlib
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple
from uuid import uuid4
from nuclia.lib.kb import NucliaDBClient

import requests
from nucliadb_models.text import TextFormat
Expand All @@ -18,6 +17,7 @@
from nuclia.data import get_auth
from nuclia.decorators import kb
from nuclia.lib.conversations import Conversation
from nuclia.lib.kb import NucliaDBClient
from nuclia.sdk.auth import NucliaAuth
from nuclia.sdk.logger import logger
from nuclia.sdk.resource import RESOURCE_ATTRIBUTES, NucliaResource
Expand Down
34 changes: 34 additions & 0 deletions nuclia/tests/test_kb/test_export_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import tempfile
import time

import pytest

from nuclia.sdk.export_import import NucliaExports, NucliaImports


@pytest.mark.skip(reason="To avoid duplicating data in the test KB")
def test_sync(testing_config):
    """Export the KB to a temporary file, then import it back synchronously."""
    exporter, importer = NucliaExports(), NucliaImports()

    with tempfile.TemporaryDirectory() as workdir:
        export_file = f"{workdir}/kb.export"
        # Passing a path makes start() wait for and download the export.
        exporter.start(path=export_file)
        # sync=True makes start() wait until the import has finished.
        importer.start(path=export_file, sync=True)


@pytest.mark.skip(reason="To avoid duplicating data in the test KB")
def test_manual(testing_config):
    """Run export, download and import as separate steps and poll the import."""
    # Imported locally so the terminal states can be compared against the
    # same enum the SDK code uses.
    from nucliadb_models.export_import import Status

    exports = NucliaExports()
    imports = NucliaImports()
    with tempfile.TemporaryDirectory() as tempdir:
        path = f"{tempdir}/kb.export"
        # start() only returns the creation response when no path is given;
        # with a path it downloads the export itself and returns None.
        export_resp = exports.start()
        # download() and status() only accept keyword arguments.
        exports.download(export_id=export_resp.export_id, path=path)
        # Keep the import response: it carries the import_id polled below.
        import_resp = imports.start(path=path)
        while True:
            status = imports.status(import_id=import_resp.import_id).status
            if status == Status.FINISHED:
                break
            assert status not in (Status.ERRORED, Status.CANCELLED)
            time.sleep(5)
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ pydantic<2.0.0
pyyaml>=5.4
requests
prompt_toolkit
nucliadb_sdk>=2.23.0
nucliadb_models>=2.23.0
nucliadb_sdk>=2.24.1
nucliadb_models>=2.24.1
tqdm
3 changes: 2 additions & 1 deletion test-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ types-requests==2.31.0.2
types-tqdm==4.66.0.2
pytest-asyncio==0.21.1
nucliadb>=2.23.0
nats-py==2.2.0
nats-py==2.2.0
pytest_lazy_fixtures==1.0.1