Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

feat: support data generation #715

Merged
merged 24 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))

### Removed

### Changed
Expand Down
102 changes: 93 additions & 9 deletions finetuner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from finetuner.constants import (
DEFAULT_FINETUNER_HOST,
DEFAULT_HUBBLE_REGISTRY,
EMBEDDING,
HOST,
HUBBLE_REGISTRY,
)
Expand Down Expand Up @@ -65,9 +66,16 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:
return rv


def list_models() -> List[str]:
"""List available models."""
return [name for name in list_model_classes()]
def list_models(model_type: str = EMBEDDING) -> List[str]:
    """List the display names of the available models.

    :param model_type: The type of backbone model, one of 'embedding',
        'cross_encoding' or 'relation_mining'. 'embedding' by default.
    :return: A list of model display names for the given model type.
    """
    return [
        stub.display_name
        for stub in list_model_classes(model_type=model_type).values()
    ]


def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
Expand All @@ -91,16 +99,19 @@ def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
}


def describe_models(task: Optional[str] = None) -> None:
def describe_models(task: Optional[str] = None, model_type: str = EMBEDDING) -> None:
    """Print model information, such as name, task, output dimension, architecture
    and description as a table.

    :param task: The task for the backbone model, one of `text-to-text`,
        `text-to-image`, `image-to-image`. If not provided, will print all backbone
        models.
    :param model_type: The type of backbone model, one of 'embedding',
        'cross_encoding' or 'relation_mining'. 'embedding' by default; the `task`
        parameter is ignored if this is set to anything other than 'embedding'.
    """
    print_model_table(model, task=task, model_type=model_type)


@login_required
Expand Down Expand Up @@ -137,8 +148,8 @@ def fit(
loss_optimizer: Optional[str] = None,
loss_optimizer_options: Optional[Dict[str, Any]] = None,
) -> Run:
"""Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
job to the Jina AI Cloud.
"""Create a Finetuner training :class:`Run`, calling this function will submit a
fine-tuning job to the Jina AI Cloud.

:param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
`finetuner.describe_models()` to see the available model names.
Expand Down Expand Up @@ -240,7 +251,7 @@ def fit(
extremely slow and inefficient.
"""

return ft.create_run(
return ft.create_training_run(
model=model,
train_data=train_data,
eval_data=eval_data,
Expand Down Expand Up @@ -275,9 +286,82 @@ def fit(
)


# `create_run` and `fit` do the same
@login_required
def synthesize(
    query_data: Union[str, List[str], DocumentArray],
    corpus_data: Union[str, List[str], DocumentArray],
    mining_models: Union[str, List[str]],
    cross_encoder_model: str,
    num_relations: int,
    max_num_docs: Optional[int] = None,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    experiment_name: Optional[str] = None,
    device: str = 'cuda',
    num_workers: int = 4,
    csv_options: Optional[CSVOptions] = None,
    public: bool = False,
) -> Run:
    """Create a Finetuner generation :class:`Run`, calling this function will submit a
    data generation job to the Jina AI Cloud.

    :param query_data: Either a :class:`DocumentArray` for example queries, name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
        a list of strings or a path to a CSV file.
    :param corpus_data: Either a :class:`DocumentArray` for corpus data, a name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
        list of strings or a path to a CSV file.
    :param mining_models: The name or a list of names of models to be used during
        relation mining. Run `finetuner.list_models(model_type='relation_mining')` or
        `finetuner.describe_models(model_type='relation_mining')` to see the
        available model names.
    :param cross_encoder_model: The name of the model to be used as the cross-encoder.
        Run `finetuner.list_models(model_type='cross_encoding')` or
        `finetuner.describe_models(model_type='cross_encoding')` to see the
        available model names.
    :param num_relations: The number of relations to mine per query.
    :param max_num_docs: The maximum number of documents to consider.
    :param run_name: Name of the run.
    :param description: Run description.
    :param experiment_name: Name of the experiment.
    :param device: The device to run the job on, either `cuda` (an NVIDIA GPU
        will be used) or `cpu`.
    :param num_workers: Number of CPU workers. If `cpu: False` this is the number of
        workers used by the dataloader.
    :param csv_options: A :class:`CSVOptions` object containing options used for
        reading in training and evaluation data from a CSV file, if they are
        provided as such.
    :param public: A boolean value indicates if the artifact is public. It should be
        set to `True` if you would like to share your synthesized data with others.
    :return: A :class:`Run` object for the submitted data generation job.

    .. note::
        Unless necessary, please stick with `device="cuda"`, `cpu` training could be
        extremely slow and inefficient.
    """
    return ft.create_synthesis_run(
        query_data=query_data,
        corpus_data=corpus_data,
        mining_models=mining_models,
        cross_encoder_model=cross_encoder_model,
        num_relations=num_relations,
        max_num_docs=max_num_docs,
        run_name=run_name,
        description=description,
        experiment_name=experiment_name,
        device=device,
        num_workers=num_workers,
        csv_options=csv_options,
        public=public,
    )


# `create_run`, `create_training_run` and `fit` are aliases of the same function.
create_training_run = fit
create_run = fit

# `create_synthesis_run` is an alias of `synthesize`.
create_synthesis_run = synthesize


def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
"""Get a :class:`Run` by its name and (optional) :class:`Experiment` name.
Expand Down
3 changes: 3 additions & 0 deletions finetuner/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POST,
RUNS,
STATUS,
TASK,
)

_finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
Expand Down Expand Up @@ -267,6 +268,7 @@ def create_run(
experiment_name: str,
run_name: str,
run_config: dict,
task: str,
device: str,
cpus: int,
gpus: int,
Expand All @@ -291,6 +293,7 @@ def create_run(
json_data={
NAME: run_name,
CONFIG: run_config,
TASK: task,
FINETUNER_VERSION: _finetuner_core_version,
DEVICE: device,
CPUS: cpus,
Expand Down
5 changes: 3 additions & 2 deletions finetuner/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from rich.console import Console
from rich.table import Table

from finetuner.constants import EMBEDDING
from finetuner.model import list_model_classes

console = Console()


def print_model_table(model, task: Optional[str] = None):
def print_model_table(model, task: Optional[str] = None, model_type: str = EMBEDDING):
"""Prints a table of model descriptions.

:param model: Module with model definitions
Expand All @@ -24,7 +25,7 @@ def print_model_table(model, task: Optional[str] = None):
for column in header:
table.add_column(column, justify='right', style='cyan', no_wrap=False)

for _, _model_class in list_model_classes().items():
for _, _model_class in list_model_classes(model_type=model_type).items():
if _model_class.display_name not in model_display_names:
row = model.get_row(_model_class)
if task and row[1] != task:
Expand Down
15 changes: 15 additions & 0 deletions finetuner/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,18 @@
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
VAL_SPLIT = 'val_split'
# Run payload keys and job types
TASK = 'task'
TRAINING = 'training'
GENERATION = 'generation'
# Generation job
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
CROSS_ENCODER = 'cross_encoder'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
MAX_NUM_DOCS = 'max_num_docs'
# Stub types
EMBEDDING = 'embedding'
CROSS_ENCODING = 'cross_encoding'
38 changes: 35 additions & 3 deletions finetuner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,31 @@ def parse(self):
yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


class DataGenerationParser(_CSVParser):
    """CSV parser for data-generation (synthesis) jobs.

    Every cell of the CSV is treated as the text of one independent
    :class:`Document`, so the shape of the file (a single column, a single
    row, or a grid) does not matter.
    """

    # NOTE: no `__init__` override — the inherited
    # `_CSVParser.__init__(file, task, options)` is used as-is.

    def parse(self):
        """Yield one plain-text :class:`Document` per CSV cell.

        Rows are subsampled according to ``self._options.size`` and
        ``self._options.sampling_rate`` before their cells are emitted.
        """
        with self._file_ctx as fp:
            rows = csv.reader(fp, dialect=self._options.dialect)
            for row in _subsample(rows, self._options.size, self._options.sampling_rate):
                for cell in row:
                    yield Document(text=cell)


class CSVContext:
"""
A CSV context switch class with conditions to parse CSVs into DocumentArray.
Expand All @@ -232,12 +257,15 @@ class CSVContext:

def __init__(
self,
model: str,
model: Optional[str] = None,
task: Optional[str] = None,
LMMilliken marked this conversation as resolved.
Show resolved Hide resolved
options: Optional[CSVOptions] = None,
):
self._model = model
self._options = options or CSVOptions()
if model == 'mlp':
if not model:
self._task = 'generation'
elif model == 'mlp':
self._task = 'image-to-image'
else:
model_stub = get_stub(
Expand All @@ -248,7 +276,11 @@ def __init__(
self._task = model_stub.task

def _get_csv_parser(self, data: Union[str, TextIO]):
if self._options.is_labeled:
if self._task == 'generation':
return DataGenerationParser(
file=data, task=self._task, options=self._options
)
elif self._options.is_labeled:
return LabeledCSVParser(file=data, task=self._task, options=self._options)
else:
_, num_columns = get_csv_file_dialect_columns(
Expand Down
Loading