Commit 88b3fab

chore: implement review notes

lmmilliken committed Apr 14, 2023
1 parent 4d627dd · commit 88b3fab
Showing 11 changed files with 46 additions and 111 deletions.
39 changes: 8 additions & 31 deletions finetuner/__init__.py

```diff
@@ -9,7 +9,6 @@
 from finetuner.constants import (
     DEFAULT_FINETUNER_HOST,
     DEFAULT_HUBBLE_REGISTRY,
-    EMBEDDING,
     HOST,
     HUBBLE_REGISTRY,
 )
@@ -66,16 +65,9 @@ def _build_name_stub_map() -> Dict[str, model_stub.ModelStubType]:
     return rv


-def list_models(model_type: str = EMBEDDING) -> List[str]:
-    """List available models.
-
-    :param type: The type of backbone model, one of 'embedding', 'cross_ecoding' or
-        'relation_mining'. 'embedding' by default.
-    """
-    return [
-        stub.display_name for stub in list_model_classes(model_type=model_type).values()
-    ]
+def list_models() -> List[str]:
+    """List available models."""
+    return [stub.display_name for stub in list_model_classes().values()]


 def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
@@ -99,19 +91,16 @@ def list_model_options() -> Dict[str, List[Dict[str, Any]]]:
     }


-def describe_models(task: Optional[str] = None, model_type: str = EMBEDDING) -> None:
+def describe_models(task: Optional[str] = None) -> None:
     """Print model information, such as name, task, output dimension, architecture
     and description as a table.

     :param task: The task for the backbone model, one of `text-to-text`,
         `text-to-image`, `image-to-image`. If not provided, will print all backbone
         models.
-    :param type: The type of backbone model, one of 'embedding', 'cross_ecoding' or
-        'relation_mining'. 'embedding' by default, the `task` parameter will be ignored
-        if this is set to anything else.
     """
-    print_model_table(model, task=task, model_type=model_type)
+    print_model_table(model, task=task)


 @login_required
```
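With the `model_type` parameter gone, both helpers now cover embedding backbones only. A minimal sketch of the new call surface, assuming the `finetuner` package is installed (the task filter value is just one of the options named in the docstring):

```python
import finetuner

# List the display names of all available embedding backbones.
names = finetuner.list_models()

# Print the same backbones as a table, optionally filtered by task.
finetuner.describe_models(task='text-to-text')
```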
```diff
@@ -290,9 +279,7 @@ def fit(
 def synthesize(
     query_data: Union[str, List[str], DocumentArray],
     corpus_data: Union[str, List[str], DocumentArray],
-    mining_models: Union[str, List[str]],
-    cross_encoder_model: str,
-    num_relations: int,
+    num_relations: int = 3,
     max_num_docs: Optional[int] = None,
     run_name: Optional[str] = None,
     description: Optional[str] = None,
@@ -302,23 +289,15 @@ def synthesize(
     csv_options: Optional[CSVOptions] = None,
     public: bool = False,
 ) -> Run:
-    """Create a Finetuner generation :class:`Run`, calling this function will submit a
-    data generation job to the Jina AI Cloud.
+    """Create a Finetuner synthesis :class:`Run`, calling this function will submit a
+    data synthesis job to the Jina AI Cloud.

     :param query_data: Either a :class:`DocumentArray` for example queries, name of a
         `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as
         a list of strings or a path to a CSV file.
     :param corpus_data: Either a :class:`DocumentArray` for corpus data, a name of a
         `DocumentArray` that is pushed on Jina AI Cloud, the dataset itself as a
         list of strings or a path to a CSV file.
-    :param mining_models: The name or a list of names of models to be used during
-        relation mining. Run `finetuner.list_models(model_type='relation_mining')` or
-        `finetuner.describe_models(model_type='relation_mining')` to see the
-        available model names.
-    :param cross_encoder_model: The name of the model to be used as the cross-encoder.
-        Run `finetuner.list_models(model_type='cross_encoding')` or
-        `finetuner.describe_models(model_type='cross_encoding')` to see the
-        available model names.
     :param num_relations: The number of relations to mine per query.
     :param max_num_docs: The maximum number of documents to consider.
     :param run_name: Name of the run.
@@ -341,8 +320,6 @@ def synthesize(
     return ft.create_synthesis_run(
         query_data=query_data,
         corpus_data=corpus_data,
-        mining_models=mining_models,
-        cross_encoder_model=cross_encoder_model,
         num_relations=num_relations,
         max_num_docs=max_num_docs,
         run_name=run_name,
```
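Callers therefore no longer choose mining or cross-encoder models; only the data and an optional `num_relations` remain. A sketch of a call under the new signature, with hypothetical dataset names:

```python
import finetuner

finetuner.login()  # synthesis jobs are submitted to the Jina AI Cloud

run = finetuner.synthesize(
    query_data='my-query-data',    # hypothetical pushed DocumentArray name or CSV path
    corpus_data='my-corpus-data',  # hypothetical pushed DocumentArray name or CSV path
    num_relations=3,               # now optional; defaults to 3
)
```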
5 changes: 2 additions & 3 deletions finetuner/console.py

```diff
@@ -3,13 +3,12 @@
 from rich.console import Console
 from rich.table import Table

-from finetuner.constants import EMBEDDING
 from finetuner.model import list_model_classes

 console = Console()


-def print_model_table(model, task: Optional[str] = None, model_type: str = EMBEDDING):
+def print_model_table(model, task: Optional[str] = None):
     """Prints a table of model descriptions.

     :param model: Module with model definitions
@@ -25,7 +24,7 @@ def print_model_table(model, task: Optional[str] = None, model_type: str = EMBEDDING):
     for column in header:
         table.add_column(column, justify='right', style='cyan', no_wrap=False)

-    for _, _model_class in list_model_classes(model_type=model_type).items():
+    for _, _model_class in list_model_classes().items():
         if _model_class.display_name not in model_display_names:
             row = model.get_row(_model_class)
             if task and row[1] != task:
```
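For reference, a sketch of how the simplified helper is driven; `describe_models` passes the `finetuner.model` stubs module in as the `model` argument:

```python
from finetuner import model
from finetuner.console import print_model_table

# Render every embedding backbone, or narrow the table down to one task.
print_model_table(model)
print_model_table(model, task='text-to-text')
```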
11 changes: 5 additions & 6 deletions finetuner/constants.py

```diff
@@ -82,17 +82,16 @@
 NUM_ITEMS_PER_CLASS = 'num_items_per_class'
 VAL_SPLIT = 'val_split'
 TASK = 'task'
-TRAINING = 'training'
-GENERATION = 'generation'
-# Generation job
+TRAINING_TASK = 'training'
+GENERATION_TASK = 'generation'
+# Synthesis job
 RAW_DATA_CONFIG = 'data'
 RELATION_MINING = 'relation_mining'
+DEFAULT_RELATION_MINER = 'sentence-transformers/msmarco-distilbert-base-v3'
 CROSS_ENCODER = 'cross_encoder'
+DEFAULT_CROSS_ENCODER = 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1'
 QUERIES = 'queries'
 CORPUS = 'corpus'
 MODELS = 'models'
 NUM_RELATIONS = 'num_relations'
 MAX_NUM_DOCS = 'max_num_docs'
-# Stub types
-EMBEDDING = 'embedding'
-CROSS_ENCODING = 'cross_encoding'
```
8 changes: 4 additions & 4 deletions finetuner/data.py

```diff
@@ -222,7 +222,7 @@ def parse(self):
             yield Document(chunks=[doc1, doc2], tags={DEFAULT_TAG_SCORE_KEY: col3})


-class DataGenerationParser(_CSVParser):
+class DataSynthesisParser(_CSVParser):
     """
     CSV has either one column or one row, each item in the CSV represents a single
     document so the structure of the CSV file is not important.
@@ -264,7 +264,7 @@ def __init__(
         self._model = model
         self._options = options or CSVOptions()
         if not model:
-            self._task = 'generation'
+            self._task = 'synthesis'
         elif model == 'mlp':
             self._task = 'image-to-image'
         else:
@@ -276,8 +276,8 @@ def __init__(
             self._task = model_stub.task

     def _get_csv_parser(self, data: Union[str, TextIO]):
-        if self._task == 'generation':
-            return DataGenerationParser(
+        if self._task == 'synthesis':
+            return DataSynthesisParser(
                 file=data, task=self._task, options=self._options
             )
         elif self._options.is_labeled:
```
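The renamed parser is selected whenever a `CSVContext` is built without a model, which is exactly the synthesis path. A sketch of the dispatch; the CSV filename is hypothetical, and `_get_csv_parser` is private, called here only to illustrate:

```python
from finetuner.data import CSVContext, CSVOptions

# With no model, __init__ sets self._task = 'synthesis' ...
ctx = CSVContext(model=None, options=CSVOptions())

# ... so the parser dispatch yields a DataSynthesisParser, which treats
# every cell of the CSV as a standalone document.
parser = ctx._get_csv_parser('queries.csv')
```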
31 changes: 12 additions & 19 deletions finetuner/experiment.py

```diff
@@ -11,12 +11,14 @@
     CALLBACKS,
     CONFIG,
     CREATED_AT,
+    DEFAULT_CROSS_ENCODER,
+    DEFAULT_RELATION_MINER,
     DESCRIPTION,
     DEVICE,
     EPOCHS,
     EVAL_DATA,
     FREEZE,
-    GENERATION,
+    GENERATION_TASK,
     LEARNING_RATE,
     LOSS,
     LOSS_OPTIMIZER,
@@ -37,11 +39,11 @@
     SAMPLER,
     SCHEDULER,
     SCHEDULER_OPTIONS,
-    TRAINING,
+    TRAINING_TASK,
     VAL_SPLIT,
 )
 from finetuner.data import CSVContext, CSVOptions
-from finetuner.hubble import push_generation_data, push_training_data
+from finetuner.hubble import push_synthesis_data, push_training_data
 from finetuner.names import get_random_name
 from finetuner.run import Run
@@ -199,7 +201,7 @@ def create_training_run(
             run_name=run_name,
             experiment_name=self._name,
             run_config=config,
-            task=TRAINING,
+            task=TRAINING_TASK,
             device=device,
             cpus=num_workers,
             gpus=1,
@@ -218,9 +220,7 @@ def create_synthesis_run(
         self,
         query_data: Union[str, List[str], DocumentArray],
         corpus_data: Union[str, List[str], DocumentArray],
-        mining_models: Union[str, List[str]],
-        cross_encoder_model: str,
-        num_relations: int,
+        num_relations: int = 3,
         run_name: Optional[str] = None,
         csv_options: Optional[CSVOptions] = None,
         **kwargs,
@@ -239,7 +239,7 @@ def create_synthesis_run(
             if isinstance(query_data, str)
             else DocumentArray([Document(text=data) for data in corpus_data])
         )
-        query_data, corpus_data = push_generation_data(
+        query_data, corpus_data = push_synthesis_data(
             experiment_name=self._name,
             run_name=run_name,
             query_data=query_data,
@@ -249,8 +249,6 @@ def create_synthesis_run(
         config = self._create_synthesis_config(
             query_data=query_data,
             corpus_data=corpus_data,
-            mining_models=mining_models,
-            cross_encoder_model=cross_encoder_model,
             num_relations=num_relations,
             experiment_name=self._name,
             run_name=run_name,
@@ -266,7 +264,7 @@ def create_synthesis_run(
             run_name=run_name,
             experiment_name=self._name,
             run_config=config,
-            task=GENERATION,
+            task=GENERATION_TASK,
             device=device,
             cpus=num_workers,
             gpus=1,
@@ -376,22 +374,17 @@ def _create_finetuning_config(
     def _create_synthesis_config(
         query_data: str,
         corpus_data: str,
-        mining_models: Union[str, List[str]],
-        cross_encoder_model: str,
         num_relations: int,
         experiment_name: str,
         run_name: str,
         **kwargs,
     ) -> Dict[str, Any]:
-        """Create a generation config for a :class:`Run`.
+        """Create a synthesis config for a :class:`Run`.

         :param query_data: Name of the :class:`DocumentArray` containing the query data
             used during training.
         :param corpus_data: Name of the :class:`DocumentArray` containing the corpus
             data used during training.
-        :param mining_models: The name or list of names of models to be used during
-            relation mining.
-        :param cross_encoder_model: Name of the cross encoder model.
         :param num_relations: Number of relations to mine per query.
         :return: Run parameters wrapped up as a config dict.
         """
@@ -401,13 +394,13 @@ def _create_synthesis_config(
             corpus=corpus_data,
         )
         relation_mining = config.RelationMiningConfig(
-            models=[mining_models] if isinstance(mining_models, str) else mining_models,
+            models=[DEFAULT_RELATION_MINER],
             num_relations=num_relations,
         )
         generation_config = config.DataGenerationConfig(
             data=data,
             relation_mining=relation_mining,
-            cross_encoder=cross_encoder_model,
+            cross_encoder=DEFAULT_CROSS_ENCODER,
             max_num_docs=kwargs.get(MAX_NUM_DOCS),
             public=public,
             experiment_name=experiment_name,
```
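The net effect on the submitted config: both model choices are now pinned to the defaults added to `finetuner.constants` in this commit. Assuming the config objects serialize field for field, the synthesis-specific portion comes out roughly as:

```python
# Sketch of the resulting values, not the exact serialized layout.
{
    'relation_mining': {
        'models': ['sentence-transformers/msmarco-distilbert-base-v3'],  # DEFAULT_RELATION_MINER
        'num_relations': 3,  # the new default
    },
    'cross_encoder': 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1',  # DEFAULT_CROSS_ENCODER
}
```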
8 changes: 2 additions & 6 deletions finetuner/finetuner.py

```diff
@@ -230,9 +230,7 @@ def create_synthesis_run(
         self,
         query_data: Union[str, List[str], DocumentArray],
         corpus_data: Union[str, List[str], DocumentArray],
-        mining_models: Union[str, List[str]],
-        cross_encoder_model: str,
-        num_relations: int,
+        num_relations: int = 3,
         max_num_docs: Optional[int] = None,
         run_name: Optional[str] = None,
         description: Optional[str] = None,
@@ -242,7 +240,7 @@ def create_synthesis_run(
         csv_options: Optional[CSVOptions] = None,
         public: bool = False,
     ) -> Run:
-        """Create a generation run.
+        """Create a synthesis run.

         If an experiment name is not specified, the run will be created in the default
         experiment.
@@ -263,8 +261,6 @@ def create_synthesis_run(
         return experiment.create_synthesis_run(
             query_data=query_data,
             corpus_data=corpus_data,
-            mining_models=mining_models,
-            cross_encoder_model=cross_encoder_model,
             num_relations=num_relations,
             max_num_docs=max_num_docs,
             run_name=run_name,
```
4 changes: 2 additions & 2 deletions finetuner/hubble.py

```diff
@@ -64,15 +64,15 @@ def push_training_data(
     )


-def push_generation_data(
+def push_synthesis_data(
     experiment_name: str,
     run_name: str,
     query_data: Union[str, DocumentArray],
     corpus_data: Union[str, DocumentArray],
 ) -> Tuple[Optional[str], ...]:
     """Upload data to Hubble and returns their names.

-    Uploads all data needed for data generation - query data and corpus data.
+    Uploads all data needed for data synthesis - query data and corpus data.
     Data is given either as a `DocumentArray` or
     a name of the `DocumentArray` that is already pushed to Hubble.
```
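A sketch of the renamed helper in use; the experiment and run names are hypothetical, and either argument may be a `DocumentArray` or the name of one already pushed to Hubble:

```python
from docarray import Document, DocumentArray

from finetuner.hubble import push_synthesis_data

query_name, corpus_name = push_synthesis_data(
    experiment_name='my-experiment',
    run_name='my-run',
    query_data=DocumentArray([Document(text='how do I reset my password?')]),
    corpus_data='my-pushed-corpus',  # already on Hubble, passed by name
)
```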
17 changes: 3 additions & 14 deletions finetuner/model.py

```diff
@@ -1,12 +1,6 @@
 from _finetuner.runner.stubs import model
 from _finetuner.runner.stubs.model import *  # noqa F401
-from _finetuner.runner.stubs.model import (
-    _CrossEncoderStub,
-    _EmbeddingModelStub,
-    _TextTransformerStub,
-)
-
-from finetuner.constants import CROSS_ENCODING, EMBEDDING, RELATION_MINING
+from _finetuner.runner.stubs.model import _EmbeddingModelStub


 def get_header() -> Tuple[str, ...]:
@@ -25,15 +19,10 @@ def get_row(model_stub) -> Tuple[str, ...]:
     )


-def list_model_classes(model_type: str = EMBEDDING) -> Dict[str, ModelStubType]:
+def list_model_classes() -> Dict[str, ModelStubType]:
     rv = {}
     members = inspect.getmembers(model, inspect.isclass)
-    if model_type == EMBEDDING:
-        parent_class = _EmbeddingModelStub
-    elif model_type == CROSS_ENCODING:
-        parent_class = _CrossEncoderStub
-    elif model_type == RELATION_MINING:
-        parent_class = _TextTransformerStub
+    parent_class = _EmbeddingModelStub
     for name, stub in members:
         if (
             name != 'MLPStub'
```
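With the branching removed, the helper reduces to collecting subclasses of `_EmbeddingModelStub` exposed by the stubs module. A condensed sketch of the surviving logic, assuming the private `_finetuner` package is importable; the subclass check is an assumption, since the diff context cuts off after the `'MLPStub'` condition:

```python
import inspect

from _finetuner.runner.stubs import model
from _finetuner.runner.stubs.model import _EmbeddingModelStub

rv = {}
for name, stub in inspect.getmembers(model, inspect.isclass):
    # The real loop applies further filter conditions after the 'MLPStub'
    # check, which are not visible in this hunk.
    if name != 'MLPStub' and issubclass(stub, _EmbeddingModelStub):
        rv[name] = stub
```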
(Diffs for the three remaining changed files did not load on this page.)
