Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

feat: support data generation #715

Merged
merged 24 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))

### Removed

### Changed
Expand Down
80 changes: 72 additions & 8 deletions finetuner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
HOST,
HUBBLE_REGISTRY,
)
from finetuner.data import CSVOptions
from finetuner.data import CSVOptions, SynthesisModels
from finetuner.run import Run
from hubble import login_required

Expand Down Expand Up @@ -67,13 +67,13 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:

def list_models() -> List[str]:
    """List available models.

    :return: The display name of every model class returned by
        `list_model_classes()`.
    """
    # Only the display names are reported; the keys of the mapping returned by
    # `list_model_classes()` are not used here.
    return [_model_class.display_name for _model_class in list_model_classes().values()]


def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
"""List available options per model."""
return {
name: [
_model_class.display_name: [
{
'name': parameter.name,
'type': parameter.annotation,
Expand All @@ -87,7 +87,7 @@ def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
).parameters.values()
if parameter.name != 'self'
]
for name, _model_class in list_model_classes().items()
for _model_class in list_model_classes().values()
}


Expand Down Expand Up @@ -137,8 +137,8 @@ def fit(
loss_optimizer: Optional[str] = None,
loss_optimizer_options: Optional[Dict[str, Any]] = None,
) -> Run:
"""Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
job to the Jina AI Cloud.
"""Create a Finetuner training :class:`Run`, calling this function will submit a
fine-tuning job to the Jina AI Cloud.

:param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
`finetuner.describe_models()` to see the available model names.
Expand Down Expand Up @@ -240,7 +240,7 @@ def fit(
extremely slow and inefficient.
"""

return ft.create_run(
return ft.create_training_run(
model=model,
train_data=train_data,
eval_data=eval_data,
Expand Down Expand Up @@ -275,9 +275,73 @@ def fit(
)


# `create_run` and `fit` do the same
@login_required
def synthesize(
    query_data: Union[str, List[str], DocumentArray],
    corpus_data: Union[str, List[str], DocumentArray],
    models: SynthesisModels,
    num_relations: int = 3,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    experiment_name: Optional[str] = None,
    device: str = 'cuda',
    num_workers: int = 4,
    csv_options: Optional[CSVOptions] = None,
    public: bool = False,
) -> Run:
    """Create a Finetuner synthesis :class:`Run`.

    Calling this function will submit a data synthesis job to the Jina AI Cloud.

    :param query_data: Either a :class:`DocumentArray` for example queries, the
        name of a `DocumentArray` that is pushed on Jina AI Cloud, the dataset
        itself as a list of strings or a path to a CSV file.
    :param corpus_data: Either a :class:`DocumentArray` for corpus data, the
        name of a `DocumentArray` that is pushed on Jina AI Cloud, the dataset
        itself as a list of strings or a path to a CSV file.
    :param models: A :class:`SynthesisModels` object containing the names of
        the models used for relation mining and cross encoding.
        You can pass `finetuner.data.DATA_SYNTHESIS_EN` for the recommended
        models for synthesis based on English data.
    :param num_relations: The number of relations to mine per query.
    :param run_name: Name of the run.
    :param description: Run description.
    :param experiment_name: Name of the experiment.
    :param device: The device to run the job on. If set to `cuda`, an Nvidia GPU
        will be used; set it to `cpu` to run a CPU job.
    :param num_workers: Number of CPU workers. If the job does not run on the
        CPU (`device='cuda'`), this is the number of workers used by the
        dataloader.
    :param csv_options: A :class:`CSVOptions` object containing options used for
        reading in query and corpus data from a CSV file, if they are provided
        as such.
    :param public: A boolean value indicating whether the artifact is public. It
        should be set to `True` if you would like to share your synthesized data
        with others.
    :return: The :class:`Run` returned by `ft.create_synthesis_run`.

    .. note::
        Unless necessary, please stick with `device="cuda"`, running the job on
        `cpu` could be extremely slow and inefficient.
    """
    # Thin public wrapper: every argument is forwarded unchanged, by keyword.
    return ft.create_synthesis_run(
        query_data=query_data,
        corpus_data=corpus_data,
        models=models,
        num_relations=num_relations,
        run_name=run_name,
        description=description,
        experiment_name=experiment_name,
        device=device,
        num_workers=num_workers,
        csv_options=csv_options,
        public=public,
    )


# `create_training_run` and `create_run` are aliases of `fit`: all three names are
# bound to the same function object.
create_training_run = fit
create_run = fit

# `create_synthesis_run` is an alias of `synthesize`.
create_synthesis_run = synthesize


def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
"""Get a :class:`Run` by its name and (optional) :class:`Experiment` name.
Expand Down
3 changes: 3 additions & 0 deletions finetuner/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POST,
RUNS,
STATUS,
TASK,
)

_finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
Expand Down Expand Up @@ -267,6 +268,7 @@ def create_run(
experiment_name: str,
run_name: str,
run_config: dict,
task: str,
device: str,
cpus: int,
gpus: int,
Expand All @@ -291,6 +293,7 @@ def create_run(
json_data={
NAME: run_name,
CONFIG: run_config,
TASK: task,
FINETUNER_VERSION: _finetuner_core_version,
DEVICE: device,
CPUS: cpus,
Expand Down
13 changes: 13 additions & 0 deletions finetuner/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,16 @@
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
VAL_SPLIT = 'val_split'
# Key under which the task type is sent in the run-creation request payload.
TASK = 'task'
# NOTE(review): presumably the values sent under `TASK` for training and
# synthesis jobs respectively — confirm against the callers.
TRAINING_TASK = 'training'
SYNTHESIS_TASK = 'generation'
# Synthesis job
# NOTE(review): presumably keys of the synthesis run config — confirm against
# the code that builds that config.
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
CROSS_ENCODER = 'cross_encoder'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
TRAIN_DATA = 'train_data'
MAX_NUM_DOCS = 'max_num_docs'
57 changes: 53 additions & 4 deletions finetuner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,22 +222,49 @@ def parse(self):
yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


class DataSynthesisParser(_CSVParser):
    """
    CSV parser for data synthesis jobs.

    The CSV is expected to have either one column or one row. Each item in the
    CSV represents a single document, so the structure of the CSV file is not
    important: every cell is turned into its own text :class:`Document`.

    :param file: The CSV data, forwarded unchanged to the parent parser.
    :param task: The task name, forwarded unchanged to the parent parser.
    :param options: An optional instance of :class:`CSVOptions`, forwarded
        unchanged to the parent parser.
    """

    def __init__(
        self,
        file: Union[str, TextIO, StringIO],
        task: str,
        options: Optional[CSVOptions] = None,
    ):
        super().__init__(file, task, options)

    def parse(self):
        """Yield one :class:`Document` per cell of the (subsampled) CSV rows."""
        # The loop must stay inside the `with` block: `csv.reader` reads lazily,
        # so the file has to remain open while this generator is consumed.
        with self._file_ctx as fp:
            lines = csv.reader(fp, dialect=self._options.dialect)

            for columns in _subsample(
                lines, self._options.size, self._options.sampling_rate
            ):
                # Row boundaries are ignored; each cell is its own document.
                for column in columns:
                    yield Document(text=column)


class CSVContext:
"""
A CSV context switch class with conditions to parse CSVs into DocumentArray.

:param model: The model being used, to get model stub and associated task.
:param options: an instance of :class`CSVOptions`.
:param options: An instance of :class`CSVOptions`.
"""

def __init__(
self,
model: str,
model: Optional[str] = None,
options: Optional[CSVOptions] = None,
):
self._model = model
self._options = options or CSVOptions()
if model == 'mlp':
if not model:
self._task = 'synthesis'
elif model == 'mlp':
self._task = 'image-to-image'
else:
model_stub = get_stub(
Expand All @@ -248,7 +275,11 @@ def __init__(
self._task = model_stub.task

def _get_csv_parser(self, data: Union[str, TextIO]):
if self._options.is_labeled:
if self._task == 'synthesis':
return DataSynthesisParser(
file=data, task=self._task, options=self._options
)
elif self._options.is_labeled:
return LabeledCSVParser(file=data, task=self._task, options=self._options)
else:
_, num_columns = get_csv_file_dialect_columns(
Expand Down Expand Up @@ -400,3 +431,21 @@ def create_document(
doc = Document(content=column)

return doc


@dataclass
class SynthesisModels:
    """Class specifying the models to be used in a data synthesis job.

    :param relation_miner: The name of the model or list of models to use for
        relation mining.
    :param cross_encoder: The name of the model to use as the cross encoder.
    """

    relation_miner: Union[str, List[str]]
    cross_encoder: str


# Recommended models for data synthesis based on English data.
DATA_SYNTHESIS_EN = SynthesisModels(
    relation_miner='sbert-base-en',
    cross_encoder='crossencoder-base-en',
)
Loading