Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
feat: support data generation (#715)
Browse files Browse the repository at this point in the history
* feat: support generation

* test: add tests for generation jobs

* chore: remove print statements

* refactor: rename generate to synthesize

* fix: fix failing tests

* chore: update changelog

* feat: update list_models function

* chore: implement review notes

* chore: implement review notes

* feat: add  as attribute of Run

* chore: implement review notes

* fix: fix test

* chore: implement review notes

* chore: implement review notes

* feat: add synthesis models param

* feat: add synthesis models param

* feat: add synthesis models param

* chore: use display names

* chore: implement review notes

* fix: fix push_docarray

* chore: implement review comments
  • Loading branch information
LMMilliken authored Apr 24, 2023
1 parent 6114ef2 commit ea9c62e
Show file tree
Hide file tree
Showing 18 changed files with 642 additions and 69 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))

### Removed

### Changed
Expand Down
80 changes: 72 additions & 8 deletions finetuner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
HOST,
HUBBLE_REGISTRY,
)
from finetuner.data import CSVOptions
from finetuner.data import CSVOptions, SynthesisModels
from finetuner.run import Run
from hubble import login_required

Expand Down Expand Up @@ -67,13 +67,13 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:

def list_models() -> List[str]:
"""List available models."""
return [name for name in list_model_classes()]
return [_model_class.display_name for _model_class in list_model_classes().values()]


def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
"""List available options per model."""
return {
name: [
_model_class.display_name: [
{
'name': parameter.name,
'type': parameter.annotation,
Expand All @@ -87,7 +87,7 @@ def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
).parameters.values()
if parameter.name != 'self'
]
for name, _model_class in list_model_classes().items()
for _model_class in list_model_classes().values()
}


Expand Down Expand Up @@ -137,8 +137,8 @@ def fit(
loss_optimizer: Optional[str] = None,
loss_optimizer_options: Optional[Dict[str, Any]] = None,
) -> Run:
"""Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
job to the Jina AI Cloud.
"""Create a Finetuner training :class:`Run`, calling this function will submit a
fine-tuning job to the Jina AI Cloud.
:param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
`finetuner.describe_models()` to see the available model names.
Expand Down Expand Up @@ -240,7 +240,7 @@ def fit(
extremely slow and inefficient.
"""

return ft.create_run(
return ft.create_training_run(
model=model,
train_data=train_data,
eval_data=eval_data,
Expand Down Expand Up @@ -275,9 +275,73 @@ def fit(
)


# `create_run` and `fit` do the same
@login_required
def synthesize(
    query_data: Union[str, List[str], DocumentArray],
    corpus_data: Union[str, List[str], DocumentArray],
    models: SynthesisModels,
    num_relations: int = 3,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    experiment_name: Optional[str] = None,
    device: str = 'cuda',
    num_workers: int = 4,
    csv_options: Optional[CSVOptions] = None,
    public: bool = False,
) -> Run:
    """Create a Finetuner synthesis :class:`Run`, calling this function will submit a
    data synthesis job to the Jina AI Cloud.

    :param query_data: Either a :class:`DocumentArray` for example queries, name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
        a list of strings or a path to a CSV file.
    :param corpus_data: Either a :class:`DocumentArray` for corpus data, a name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
        list of strings or a path to a CSV file.
    :param models: A :class:`SynthesisModels` object containing the names of
        the models used for relation mining and cross encoding.
        You can pass `finetuner.data.DATA_SYNTHESIS_EN` for the recommended models
        for synthesis based on English data.
    :param num_relations: The number of relations to mine per query.
    :param run_name: Name of the run.
    :param description: Run description.
    :param experiment_name: Name of the experiment.
    :param device: The device to run the job on. If set to `cuda`, an Nvidia GPU
        will be used; otherwise use `cpu` to run a CPU job.
    :param num_workers: Number of CPU workers. When running on a GPU
        (`device='cuda'`), this is the number of workers used by the dataloader.
    :param csv_options: A :class:`CSVOptions` object containing options used for
        reading in query and corpus data from a CSV file, if they are
        provided as such.
    :param public: A boolean value indicates if the artifact is public. It should be
        set to `True` if you would like to share your synthesized data with others.
    :return: The :class:`Run` returned by `ft.create_synthesis_run`.

    .. note::
        Unless necessary, please stick with `device="cuda"`, running a job on
        `cpu` could be extremely slow and inefficient.
    """
    # Thin public wrapper: every argument is forwarded unchanged by keyword.
    return ft.create_synthesis_run(
        query_data=query_data,
        corpus_data=corpus_data,
        models=models,
        num_relations=num_relations,
        run_name=run_name,
        description=description,
        experiment_name=experiment_name,
        device=device,
        num_workers=num_workers,
        csv_options=csv_options,
        public=public,
    )


# Aliases: `create_run` and `create_training_run` are both bound to `fit`, so all
# three names call the same function.
create_training_run = fit
create_run = fit

# Alias: `create_synthesis_run` is bound to `synthesize`.
create_synthesis_run = synthesize


def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
"""Get a :class:`Run` by its name and (optional) :class:`Experiment` name.
Expand Down
3 changes: 3 additions & 0 deletions finetuner/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POST,
RUNS,
STATUS,
TASK,
)

_finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
Expand Down Expand Up @@ -267,6 +268,7 @@ def create_run(
experiment_name: str,
run_name: str,
run_config: dict,
task: str,
device: str,
cpus: int,
gpus: int,
Expand All @@ -291,6 +293,7 @@ def create_run(
json_data={
NAME: run_name,
CONFIG: run_config,
TASK: task,
FINETUNER_VERSION: _finetuner_core_version,
DEVICE: device,
CPUS: cpus,
Expand Down
13 changes: 13 additions & 0 deletions finetuner/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,16 @@
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
VAL_SPLIT = 'val_split'
# Key under which the task of a run is sent when the run is created, followed by
# the task values.
TASK = 'task'
TRAINING_TASK = 'training'
# NOTE(review): the synthesis task value is the string 'generation', not
# 'synthesis' - presumably the name expected by the backend; confirm before
# changing it.
SYNTHESIS_TASK = 'generation'
# Synthesis job
# NOTE(review): these look like keys of the synthesis job configuration; their
# use is not visible in this hunk - verify against the config builder.
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
CROSS_ENCODER = 'cross_encoder'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
TRAIN_DATA = 'train_data'
MAX_NUM_DOCS = 'max_num_docs'
57 changes: 53 additions & 4 deletions finetuner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,22 +222,49 @@ def parse(self):
yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


class DataSynthesisParser(_CSVParser):
    """
    CSV parser for data synthesis jobs.

    The CSV has either one column or one row. Each item in the CSV represents a
    single document, so the structure of the CSV file is not important: every
    cell of every sampled row is yielded as its own text :class:`Document`.

    :param file: Path to the CSV file, or an already opened text stream.
    :param task: Name of the task, forwarded unchanged to the base parser.
    :param options: An optional :class:`CSVOptions` instance, forwarded unchanged
        to the base parser.
    """

    def __init__(
        self,
        file: Union[str, TextIO, StringIO],
        task: str,
        options: Optional[CSVOptions] = None,
    ):
        super().__init__(file, task, options)

    def parse(self):
        """Yield one :class:`Document` per CSV cell, with the cell as its text."""
        # `csv.reader` is lazy, so the rows are consumed inside the `with` block
        # (presumably `_file_ctx` closes the file on exit - verify in the base
        # class).
        with self._file_ctx as fp:
            lines = csv.reader(fp, dialect=self._options.dialect)

            # Rows are passed through `_subsample` with the configured size and
            # sampling rate before being flattened into documents.
            for columns in _subsample(
                lines, self._options.size, self._options.sampling_rate
            ):
                for column in columns:
                    yield Document(text=column)


class CSVContext:
"""
A CSV context switch class with conditions to parse CSVs into DocumentArray.
:param model: The model being used, to get model stub and associated task.
:param options: an instance of :class`CSVOptions`.
:param options: An instance of :class:`CSVOptions`.
"""

def __init__(
self,
model: str,
model: Optional[str] = None,
options: Optional[CSVOptions] = None,
):
self._model = model
self._options = options or CSVOptions()
if model == 'mlp':
if not model:
self._task = 'synthesis'
elif model == 'mlp':
self._task = 'image-to-image'
else:
model_stub = get_stub(
Expand All @@ -248,7 +275,11 @@ def __init__(
self._task = model_stub.task

def _get_csv_parser(self, data: Union[str, TextIO]):
if self._options.is_labeled:
if self._task == 'synthesis':
return DataSynthesisParser(
file=data, task=self._task, options=self._options
)
elif self._options.is_labeled:
return LabeledCSVParser(file=data, task=self._task, options=self._options)
else:
_, num_columns = get_csv_file_dialect_columns(
Expand Down Expand Up @@ -400,3 +431,21 @@ def create_document(
doc = Document(content=column)

return doc


@dataclass
class SynthesisModels:
    """Class specifying the models to be used in a data synthesis job.

    :param relation_miner: The name of the model or list of models to use for
        relation mining.
    :param cross_encoder: The name of the model to use as the cross encoder.
    """

    relation_miner: Union[str, List[str]]
    cross_encoder: str


# Ready-made model pair for data synthesis on English data; can be passed as the
# `models` argument of a synthesis job.
DATA_SYNTHESIS_EN = SynthesisModels(
relation_miner='sbert-base-en',
cross_encoder='crossencoder-base-en',
)
Loading

0 comments on commit ea9c62e

Please sign in to comment.