Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

feat: support data generation #715

Merged
merged 24 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Add support for data generation jobs. ([#715](https://github.com/jina-ai/finetuner/pull/715))

### Removed

### Changed
Expand Down
102 changes: 93 additions & 9 deletions finetuner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from finetuner.constants import (
DEFAULT_FINETUNER_HOST,
DEFAULT_HUBBLE_REGISTRY,
EMBEDDING,
HOST,
HUBBLE_REGISTRY,
)
Expand Down Expand Up @@ -65,9 +66,16 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:
return rv


def list_models() -> List[str]:
"""List available models."""
return [name for name in list_model_classes()]
def list_models(model_type: str = EMBEDDING) -> List[str]:
    """List the display names of the available models.

    :param model_type: The type of backbone model, one of 'embedding',
        'cross_encoding' or 'relation_mining'. 'embedding' by default.
    :return: A list of model display names for the given model type.
    """
    return [
        stub.display_name
        for stub in list_model_classes(model_type=model_type).values()
    ]


def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
Expand All @@ -91,16 +99,19 @@ def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
}


def describe_models(task: Optional[str] = None) -> None:
def describe_models(task: Optional[str] = None, model_type: str = EMBEDDING) -> None:
    """Print model information, such as name, task, output dimension, architecture
    and description as a table.

    :param task: The task for the backbone model, one of `text-to-text`,
        `text-to-image`, `image-to-image`. If not provided, will print all backbone
        models.
    :param model_type: The type of backbone model, one of 'embedding',
        'cross_encoding' or 'relation_mining'. 'embedding' by default; the `task`
        parameter is ignored if this is set to anything other than 'embedding'.
    """
    print_model_table(model, task=task, model_type=model_type)


@login_required
Expand Down Expand Up @@ -137,8 +148,8 @@ def fit(
loss_optimizer: Optional[str] = None,
loss_optimizer_options: Optional[Dict[str, Any]] = None,
) -> Run:
"""Create a Finetuner :class:`Run`, calling this function will submit a fine-tuning
job to the Jina AI Cloud.
"""Create a Finetuner training :class:`Run`, calling this function will submit a
fine-tuning job to the Jina AI Cloud.

:param model: The name of model to be fine-tuned. Run `finetuner.list_models()` or
`finetuner.describe_models()` to see the available model names.
Expand Down Expand Up @@ -240,7 +251,7 @@ def fit(
extremely slow and inefficient.
"""

return ft.create_run(
return ft.create_training_run(
model=model,
train_data=train_data,
eval_data=eval_data,
Expand Down Expand Up @@ -275,9 +286,82 @@ def fit(
)


# `create_run` and `fit` do the same
@login_required
def synthesize(
    query_data: Union[str, List[str], DocumentArray],
    corpus_data: Union[str, List[str], DocumentArray],
    mining_models: Union[str, List[str]],
    cross_encoder_model: str,
    num_relations: int,
    max_num_docs: Optional[int] = None,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    experiment_name: Optional[str] = None,
    device: str = 'cuda',
    num_workers: int = 4,
    csv_options: Optional[CSVOptions] = None,
    public: bool = False,
) -> Run:
    """Create a Finetuner generation :class:`Run`, calling this function will submit a
    data generation job to the Jina AI Cloud.

    :param query_data: Either a :class:`DocumentArray` for example queries, name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
        a list of strings or a path to a CSV file.
    :param corpus_data: Either a :class:`DocumentArray` for corpus data, a name of a
        `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
        list of strings or a path to a CSV file.
    :param mining_models: The name or a list of names of models to be used during
        relation mining. Run `finetuner.list_models(model_type='relation_mining')` or
        `finetuner.describe_models(model_type='relation_mining')` to see the
        available model names.
    :param cross_encoder_model: The name of the model to be used as the cross-encoder.
        Run `finetuner.list_models(model_type='cross_encoding')` or
        `finetuner.describe_models(model_type='cross_encoding')` to see the
        available model names.
    :param num_relations: The number of relations to mine per query.
    :param max_num_docs: The maximum number of documents to consider.
    :param run_name: Name of the run.
    :param description: Run description.
    :param experiment_name: Name of the experiment.
    :param device: The device to run the job on, either `cuda` (an NVIDIA GPU
        will be used) or `cpu`.
    :param num_workers: Number of CPU workers. If `cpu: False` this is the number of
        workers used by the dataloader.
    :param csv_options: A :class:`CSVOptions` object containing options used for
        reading in training and evaluation data from a CSV file, if they are
        provided as such.
    :param public: A boolean value indicates if the artifact is public. It should be
        set to `True` if you would like to share your synthesized data with others.
    :return: A :class:`Run` object for the submitted data generation job.

    .. note::
        Unless necessary, please stick with `device="cuda"`, `cpu` training could be
        extremely slow and inefficient.
    """
    return ft.create_synthesis_run(
        query_data=query_data,
        corpus_data=corpus_data,
        mining_models=mining_models,
        cross_encoder_model=cross_encoder_model,
        num_relations=num_relations,
        max_num_docs=max_num_docs,
        run_name=run_name,
        description=description,
        experiment_name=experiment_name,
        device=device,
        num_workers=num_workers,
        csv_options=csv_options,
        public=public,
    )


# `create_run`, `create_training_run` and `fit` are aliases of the same function.
create_training_run = fit
create_run = fit

# `create_synthesis_run` is an alias of `synthesize`.
create_synthesis_run = synthesize


def get_run(run_name: str, experiment_name: Optional[str] = None) -> Run:
"""Get a :class:`Run` by its name and (optional) :class:`Experiment` name.
Expand Down
3 changes: 3 additions & 0 deletions finetuner/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POST,
RUNS,
STATUS,
TASK,
)

_finetuner_core_version = pkg_resources.get_distribution('finetuner-stubs').version
Expand Down Expand Up @@ -267,6 +268,7 @@ def create_run(
experiment_name: str,
run_name: str,
run_config: dict,
task: str,
device: str,
cpus: int,
gpus: int,
Expand All @@ -291,6 +293,7 @@ def create_run(
json_data={
NAME: run_name,
CONFIG: run_config,
TASK: task,
FINETUNER_VERSION: _finetuner_core_version,
DEVICE: device,
CPUS: cpus,
Expand Down
5 changes: 3 additions & 2 deletions finetuner/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from rich.console import Console
from rich.table import Table

from finetuner.constants import EMBEDDING
from finetuner.model import list_model_classes

console = Console()


def print_model_table(model, task: Optional[str] = None):
def print_model_table(model, task: Optional[str] = None, model_type: str = EMBEDDING):
"""Prints a table of model descriptions.

:param model: Module with model definitions
Expand All @@ -24,7 +25,7 @@ def print_model_table(model, task: Optional[str] = None):
for column in header:
table.add_column(column, justify='right', style='cyan', no_wrap=False)

for _, _model_class in list_model_classes().items():
for _, _model_class in list_model_classes(model_type=model_type).items():
if _model_class.display_name not in model_display_names:
row = model.get_row(_model_class)
if task and row[1] != task:
Expand Down
15 changes: 15 additions & 0 deletions finetuner/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,18 @@
PUBLIC = 'public'
NUM_ITEMS_PER_CLASS = 'num_items_per_class'
VAL_SPLIT = 'val_split'
# Run payload keys and job types
TASK = 'task'
TRAINING = 'training'
GENERATION = 'generation'
# Generation job
RAW_DATA_CONFIG = 'data'
RELATION_MINING = 'relation_mining'
CROSS_ENCODER = 'cross_encoder'
QUERIES = 'queries'
CORPUS = 'corpus'
MODELS = 'models'
NUM_RELATIONS = 'num_relations'
MAX_NUM_DOCS = 'max_num_docs'
# Stub types
EMBEDDING = 'embedding'
CROSS_ENCODING = 'cross_encoding'
38 changes: 35 additions & 3 deletions finetuner/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,31 @@ def parse(self):
yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


class DataGenerationParser(_CSVParser):
    """CSV parser for data-generation (synthesis) jobs.

    Every cell of the CSV is treated as the text of one independent
    :class:`Document`, so the shape of the file (a single column, a single
    row, or a grid) does not matter.
    """

    # NOTE: no `__init__` override — the inherited
    # `_CSVParser.__init__(file, task, options)` is used as-is.

    def parse(self):
        """Yield one plain-text :class:`Document` per CSV cell.

        Rows are subsampled according to ``self._options.size`` and
        ``self._options.sampling_rate`` before their cells are emitted.
        """
        with self._file_ctx as fp:
            rows = csv.reader(fp, dialect=self._options.dialect)
            for row in _subsample(rows, self._options.size, self._options.sampling_rate):
                for cell in row:
                    yield Document(text=cell)


class CSVContext:
"""
A CSV context switch class with conditions to parse CSVs into DocumentArray.
Expand All @@ -232,12 +257,15 @@ class CSVContext:

def __init__(
self,
model: str,
model: Optional[str] = None,
task: Optional[str] = None,
LMMilliken marked this conversation as resolved.
Show resolved Hide resolved
options: Optional[CSVOptions] = None,
):
self._model = model
self._options = options or CSVOptions()
if model == 'mlp':
if not model:
self._task = 'generation'
elif model == 'mlp':
self._task = 'image-to-image'
else:
model_stub = get_stub(
Expand All @@ -248,7 +276,11 @@ def __init__(
self._task = model_stub.task

def _get_csv_parser(self, data: Union[str, TextIO]):
if self._options.is_labeled:
if self._task == 'generation':
return DataGenerationParser(
file=data, task=self._task, options=self._options
)
elif self._options.is_labeled:
return LabeledCSVParser(file=data, task=self._task, options=self._options)
else:
_, num_columns = get_csv_file_dialect_columns(
Expand Down
Loading