Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

feat: support data generation #715

Merged
merged 24 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))

### Removed

### Changed
Expand Down
71 changes: 66 additions & 5 deletions finetuner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:

def list_models() -> List[str]:
"""List available models."""
return [name for name in list_model_classes()]
return [stub.display_name for stub in list_model_classes().values()]
LMMilliken marked this conversation as resolved.
Show resolved Hide resolved


def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
Expand Down Expand Up @@ -137,8 +137,8 @@ def fit(
loss_optimizer: Optional[str] = None,
loss_optimizer_options: Optional[Dict[str, Any]] = None,
) -> Run:
"""Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
job to the Jina AI Cloud.
"""Create a Finetuner training :class:`Run`, calling this function will submit a
fine-tuning job to the Jina AI Cloud.

:param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
`finetuner.describe_models()` to see the available model names.
Expand Down Expand Up @@ -240,7 +240,7 @@ def fit(
extremely slow and inefficient.
"""

return ft.create_run(
return ft.create_training_run(
model=model,
train_data=train_data,
eval_data=eval_data,
Expand Down Expand Up @@ -275,9 +275,70 @@ def fit(
)


# `create_run` and `fit` do the same
@login_required
def synthesize(
    query_data: Union[str, List[str], DocumentArray],
    corpus_data: Union[str, List[str], DocumentArray],
    num_relations: int = 3,
    max_num_docs: Optional[int] = None,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    experiment_name: Optional[str] = None,
    device: str = 'cuda',
    num_workers: int = 4,
    csv_options: Optional[CSVOptions] = None,
    public: bool = False,
) -> Run:
    """Create a Finetuner synthesis :class:`Run`. Calling this function will submit a
    data synthesis job to the Jina AI Cloud.

    :param query_data: Either a :class:`DocumentArray` of example queries, the name
        of a `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
        a list of strings or a path to a CSV file.
    :param corpus_data: Either a :class:`DocumentArray` of corpus data, the name of
        a `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
        list of strings or a path to a CSV file.
    :param num_relations: The number of relations to mine per query.
    :param max_num_docs: The maximum number of documents to consider.
    :param run_name: Name of the run.
    :param description: Run description.
    :param experiment_name: Name of the experiment.
    :param device: The device to run the job on. If set to `cuda`, a Nvidia GPU
        will be used, otherwise use `cpu` to run a cpu job.
    :param num_workers: Number of CPU workers. When running on a GPU
        (`device='cuda'`) this is the number of workers used by the dataloader.
    :param csv_options: A :class:`CSVOptions` object containing options used for
        reading in query and corpus data from a CSV file, if they are provided as
        such.
    :param public: A boolean value indicating whether the artifact is public. It
        should be set to `True` if you would like to share your synthesized data
        with others.
    :return: The :class:`Run` returned by `ft.create_synthesis_run`.

    .. note::
        Unless necessary, please stick with `device="cuda"`, running the job on
        `cpu` could be extremely slow and inefficient.
    """
    # Thin wrapper: every argument is forwarded unchanged, by keyword.
    return ft.create_synthesis_run(
        query_data=query_data,
        corpus_data=corpus_data,
        num_relations=num_relations,
        max_num_docs=max_num_docs,
        run_name=run_name,
        description=description,
        experiment_name=experiment_name,
        device=device,
        num_workers=num_workers,
        csv_options=csv_options,
        public=public,
    )


# `create_run` and `create_training_run` are aliases of `fit`: all three names are
# bound to the same function object.
create_training_run = fit
create_run = fit

# `create_synthesis_run` is an alias of `synthesize`: both names are bound to the
# same function object.
create_synthesis_run = synthesize


def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
"""Get a :class:`Run` by its name and (optional) :class:`Experiment` name.
Expand Down
3 changes: 3 additions & 0 deletions finetuner/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POST,
RUNS,
STATUS,
TASK,
)

_finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
Expand Down Expand Up @@ -267,6 +268,7 @@ def create_run(
experiment_name: str,
run_name: str,
run_config: dict,
task: str,
device: str,
cpus: int,
gpus: int,
Expand All @@ -291,6 +293,7 @@ def create_run(
json_data={
NAME: run_name,
CONFIG: run_config,
TASK: task,
FINETUNER_VERSION: _finetuner_core_version,
DEVICE: device,
CPUS: cpus,
Expand Down
15 changes: 15 additions & 0 deletions finetuner/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,18 @@
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
VAL_SPLIT = 'val_split'
# Key under which the task of a run is sent in the run-creation request payload.
TASK = 'task'
# Task names. NOTE(review): presumably the values sent under the `TASK` key - confirm
# against the callers of the client's `create_run`.
TRAINING_TASK = 'training'
# Note that the value is 'generation' although the constant is named SYNTHESIS_TASK.
SYNTHESIS_TASK = 'generation'
# Synthesis job
# NOTE(review): the names below look like keys of the synthesis run config plus two
# default model identifiers - their usage is not visible here, confirm before relying
# on this description.
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
DEFAULT_RELATION_MINER = 'sentence-transformers/msmarco-distilbert-base-v3'
CROSS_ENCODER = 'cross_encoder'
DEFAULT_CROSS_ENCODER = 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
MAX_NUM_DOCS = 'max_num_docs'
TRAIN_DATA = 'train_data'
38 changes: 35 additions & 3 deletions finetuner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,31 @@ def parse(self):
yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


class DataSynthesisParser(_CSVParser):
    """
    CSV parser used for data synthesis.

    The CSV is expected to have either one column or one row. Each item in the CSV
    represents a single document, so the layout of the file is not important.
    """

    def __init__(
        self,
        file: Union[str, TextIO, StringIO],
        task: str,
        options: Optional[CSVOptions] = None,
    ):
        """Forward all arguments unchanged to the base CSV parser.

        :param file: The CSV input, passed through to the base class.
        :param task: The task name, passed through to the base class.
        :param options: Optional :class:`CSVOptions`, passed through to the base
            class.
        """
        super().__init__(file, task, options)

    def parse(self):
        """Yield one text :class:`Document` for every cell of the CSV rows.

        The rows read from the file are first passed through `_subsample` together
        with the `size` and `sampling_rate` options.
        """
        with self._file_ctx as fp:
            rows = csv.reader(fp, dialect=self._options.dialect)
            sampled_rows = _subsample(
                rows, self._options.size, self._options.sampling_rate
            )

            # The file must stay open while iterating, as the reader is lazy.
            for row in sampled_rows:
                yield from (Document(text=cell) for cell in row)


class CSVContext:
"""
A CSV context switch class with conditions to parse CSVs into DocumentArray.
Expand All @@ -232,12 +257,15 @@ class CSVContext:

def __init__(
self,
model: str,
model: Optional[str] = None,
task: Optional[str] = None,
LMMilliken marked this conversation as resolved.
Show resolved Hide resolved
options: Optional[CSVOptions] = None,
):
self._model = model
self._options = options or CSVOptions()
if model == 'mlp':
if not model:
self._task = 'synthesis'
elif model == 'mlp':
self._task = 'image-to-image'
else:
model_stub = get_stub(
Expand All @@ -248,7 +276,11 @@ def __init__(
self._task = model_stub.task

def _get_csv_parser(self, data: Union[str, TextIO]):
if self._options.is_labeled:
if self._task == 'synthesis':
return DataSynthesisParser(
file=data, task=self._task, options=self._options
)
elif self._options.is_labeled:
return LabeledCSVParser(file=data, task=self._task, options=self._options)
else:
_, num_columns = get_csv_file_dialect_columns(
Expand Down
Loading