jina-ai · LMMilliken · Apr 24, 2023 · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))
+
 ### Removed
 
 ### Changed

diff --git a/finetuner/__init__.py b/finetuner/__init__.py
@@ -67,7 +67,7 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:
 
 def list_models() -> List[str]:
     """List available models."""
-    return [name for name in list_model_classes()]
+    return [stub.display_name for stub in list_model_classes().values()]
 
 
 def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
@@ -137,8 +137,8 @@ def fit(
     loss_optimizer: Optional[str] = None,
     loss_optimizer_options: Optional[Dict[str, Any]] = None,
 ) -> Run:
-    """Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
-    job to the Jina AI Cloud.
+    """Create a Finetuner training :class:`Run`. Calling this function submits a
+    fine-tuning job to the Jina AI Cloud.
 
     :param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
         `finetuner.describe_models()` to see the available model names.
@@ -240,7 +240,7 @@ def fit(
        extremely slow and inefficient.
     """
 
-    return ft.create_run(
+    return ft.create_training_run(
         model=model,
         train_data=train_data,
         eval_data=eval_data,
@@ -275,9 +275,70 @@ def fit(
     )
 
 
-# `create_run` and `fit` do the same
+@login_required
+def synthesize(
+    query_data: Union[str, List[str], DocumentArray],
+    corpus_data: Union[str, List[str], DocumentArray],
+    num_relations: int = 3,
+    max_num_docs: Optional[int] = None,
+    run_name: Optional[str] = None,
+    description: Optional[str] = None,
+    experiment_name: Optional[str] = None,
+    device: str = 'cuda',
+    num_workers: int = 4,
+    csv_options: Optional[CSVOptions] = None,
+    public: bool = False,
+) -> Run:
+    """Create a Finetuner synthesis :class:`Run`. Calling this function submits a
+    data synthesis job to the Jina AI Cloud.
+
+    :param query_data: Either a :class:`DocumentArray` for example queries, the name
+        of a `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
+        a list of strings or a path to a CSV file.
+    :param corpus_data: Either a :class:`DocumentArray` for corpus data, the name of
+        a `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
+        list of strings or a path to a CSV file.
+    :param num_relations: The number of relations to mine per query.
+    :param max_num_docs: The maximum number of documents to consider.
+    :param run_name: Name of the run.
+    :param description: Run description.
+    :param experiment_name: Name of the experiment.
+    :param device: The device to run on. If set to `cuda`, an Nvidia GPU is used;
+        otherwise use `cpu` to run a CPU job.
+    :param num_workers: Number of CPU workers. If `device` is `cuda`, this is the
+        number of workers used by the dataloader.
+    :param csv_options: A :class:`CSVOptions` object containing options used for
+        reading in query and corpus data from a CSV file, if they are
+        provided as such.
+    :param public: A boolean indicating whether the artifact is public. It should be
+        set to `True` if you would like to share your synthesized data with others.
+
+    .. note::
+       Unless necessary, please stick with `device="cuda"`; running on `cpu` could be
+       extremely slow and inefficient.
+    """
+    return ft.create_synthesis_run(
+        query_data=query_data,
+        corpus_data=corpus_data,
+        num_relations=num_relations,
+        max_num_docs=max_num_docs,
+        run_name=run_name,
+        description=description,
+        experiment_name=experiment_name,
+        device=device,
+        num_workers=num_workers,
+        csv_options=csv_options,
+        public=public,
+    )
+
+
+# `create_run` and `create_training_run` are aliases of `fit`
+create_training_run = fit
 create_run = fit
 
+# `create_synthesis_run` is an alias of `synthesize`
+create_synthesis_run = synthesize
+
 
 def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
     """Get a :class:`Run` by its name and (optional) :class:`Experiment` name.

diff --git a/finetuner/client/client.py b/finetuner/client/client.py
@@ -23,6 +23,7 @@
     POST,
     RUNS,
     STATUS,
+    TASK,
 )
 
 _finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
@@ -267,6 +268,7 @@ def create_run(
         experiment_name: str,
         run_name: str,
         run_config: dict,
+        task: str,
         device: str,
         cpus: int,
         gpus: int,
@@ -291,6 +293,7 @@ def create_run(
             json_data={
                 NAME: run_name,
                 CONFIG: run_config,
+                TASK: task,
                 FINETUNER_VERSION: _finetuner_core_version,
                 DEVICE: device,
                 CPUS: cpus,

diff --git a/finetuner/constants.py b/finetuner/constants.py
@@ -81,3 +81,18 @@
 PUBLIC = 'public'
 NUM_ITEMS_PER_CLASS = 'num_items_per_class'
 VAL_SPLIT = 'val_split'
+TASK = 'task'
+TRAINING_TASK = 'training'
+SYNTHESIS_TASK = 'generation'
+# Synthesis job
+RAW_DATA_CONFIG = 'data'
+RELATION_MINING = 'relation_mining'
+DEFAULT_RELATION_MINER = 'sentence-transformers/msmarco-distilbert-base-v3'
+CROSS_ENCODER = 'cross_encoder'
+DEFAULT_CROSS_ENCODER = 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1'
+QUERIES = 'queries'
+CORPUS = 'corpus'
+MODELS = 'models'
+NUM_RELATIONS = 'num_relations'
+MAX_NUM_DOCS = 'max_num_docs'
+TRAIN_DATA = 'train_data'
diff --git a/finetuner/data.py b/finetuner/data.py
@@ -222,6 +222,31 @@ def parse(self):
                 yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})
 
 
+class DataSynthesisParser(_CSVParser):
+    """
+    The CSV is expected to have either one column or one row. Each item in the CSV
+    represents a single document, so the structure of the CSV file is not important.
+    """
+
+    def __init__(
+        self,
+        file: Union[str, TextIO, StringIO],
+        task: str,
+        options: Optional[CSVOptions] = None,
+    ):
+        super().__init__(file, task, options)
+
+    def parse(self):
+        with self._file_ctx as fp:
+            lines = csv.reader(fp, dialect=self._options.dialect)
+
+            for columns in _subsample(
+                lines, self._options.size, self._options.sampling_rate
+            ):
+                for column in columns:
+                    yield Document(text=column)
+
+
 class CSVContext:
     """
     A CSV context switch class with conditions to parse CSVs into DocumentArray.
@@ -232,12 +257,15 @@ class CSVContext:
 
     def __init__(
         self,
-        model: str,
+        model: Optional[str] = None,
+        task: Optional[str] = None,
         options: Optional[CSVOptions] = None,
     ):
         self._model = model
         self._options = options or CSVOptions()
-        if model == 'mlp':
+        if not model:
+            self._task = 'synthesis'
+        elif model == 'mlp':
             self._task = 'image-to-image'
         else:
             model_stub = get_stub(
@@ -248,7 +276,11 @@ def __init__(
             self._task = model_stub.task
 
     def _get_csv_parser(self, data: Union[str, TextIO]):
-        if self._options.is_labeled:
+        if self._task == 'synthesis':
+            return DataSynthesisParser(
+                file=data, task=self._task, options=self._options
+            )
+        elif self._options.is_labeled:
             return LabeledCSVParser(file=data, task=self._task, options=self._options)
         else:
             _, num_columns = get_csv_file_dialect_columns(