diff --git a/finetuner/__init__.py b/finetuner/__init__.py
index e0d0773bd..922726175 100644
--- a/finetuner/__init__.py
+++ b/finetuner/__init__.py
@@ -10,7 +10,7 @@
 
 if TYPE_CHECKING:
     from .helper import AnyDNN, DocumentArrayLike
-    from .tuner.summary import SummaryCollection
+    from .tuner.summary import Summary
 
 
 # fit interface generated from Tuner
@@ -26,7 +26,7 @@ def fit(
     optimizer: str = 'adam',
     optimizer_kwargs: Optional[Dict] = None,
     device: str = 'cpu',
-) -> 'SummaryCollection':
+) -> 'Summary':
     ...
 
 
@@ -49,7 +49,7 @@ def fit(
     output_dim: Optional[int] = None,
     freeze: bool = False,
     device: str = 'cpu',
-) -> 'SummaryCollection':
+) -> 'Summary':
     ...
 
 
@@ -97,7 +97,7 @@ def fit(
 
 
 def fit(
     model: 'AnyDNN', train_data: 'DocumentArrayLike', *args, **kwargs
-) -> Optional['SummaryCollection']:
+) -> Optional['Summary']:
     if kwargs.get('to_embedding_model', False):
         from .tailor import to_embedding_model
diff --git a/finetuner/tuner/__init__.py b/finetuner/tuner/__init__.py
index 06752ff82..4be3f647d 100644
--- a/finetuner/tuner/__init__.py
+++ b/finetuner/tuner/__init__.py
@@ -4,7 +4,7 @@
 
 if TYPE_CHECKING:
     from .base import BaseTuner
-    from .summary import SummaryCollection
+    from .summary import Summary
 
 
 def _get_tuner_class(dnn_model: AnyDNN) -> Type['BaseTuner']:
@@ -36,7 +36,7 @@ def fit(
     optimizer_kwargs: Optional[Dict] = None,
     device: str = 'cpu',
     **kwargs,
-) -> 'SummaryCollection':
+) -> 'Summary':
     """Finetune the model on the training data.
 
     :param embed_model: an embedding model
diff --git a/finetuner/tuner/base.py b/finetuner/tuner/base.py
index e34a98fdb..3bd5d24fb 100644
--- a/finetuner/tuner/base.py
+++ b/finetuner/tuner/base.py
@@ -9,7 +9,7 @@
 )
 
 from ..helper import AnyDNN, AnyDataLoader, AnyOptimizer, DocumentArrayLike
-from .summary import SummaryCollection
+from .summary import Summary
 
 
 class BaseLoss:
@@ -101,7 +101,7 @@ def fit(
         batch_size: int = 256,
         *args,
         **kwargs,
-    ) -> SummaryCollection:
+    ) -> Summary:
         """Fit the :py:attr:`.embed_model` on labeled data.
 
         Note that fitting changes the weights in :py:attr:`.embed_model` in-place. This allows one to consecutively
diff --git a/finetuner/tuner/keras/__init__.py b/finetuner/tuner/keras/__init__.py
index 3f388f3a2..6a9e4ad68 100644
--- a/finetuner/tuner/keras/__init__.py
+++ b/finetuner/tuner/keras/__init__.py
@@ -8,7 +8,7 @@
 from . import losses, datasets
 from ..base import BaseTuner, BaseLoss
 from ..dataset.helper import get_dataset
-from ..summary import ScalarSummary, SummaryCollection
+from ..summary import ScalarSequence, Summary
 from ...helper import DocumentArrayLike, AnyDataLoader
 
 
@@ -64,10 +64,10 @@ def _get_optimizer(
 
     def _train(
         self, data: AnyDataLoader, optimizer: Optimizer, description: str
-    ) -> ScalarSummary:
+    ) -> ScalarSequence:
         """Train the model on given labeled data"""
 
-        _summary = ScalarSummary('Train Loss')
+        _summary = ScalarSequence('Train Loss')
         with ProgressBar(
             description,
             message_on_done=_summary.__str__,
@@ -96,11 +96,11 @@ def _eval(
         self,
         data: AnyDataLoader,
         description: str = 'Evaluating',
-        train_loss: Optional[ScalarSummary] = None,
-    ) -> ScalarSummary:
+        train_loss: Optional[ScalarSequence] = None,
+    ) -> ScalarSequence:
         """Evaluate the model on given labeled data"""
 
-        _summary = ScalarSummary('Eval Loss')
+        _summary = ScalarSequence('Eval Loss')
 
         with ProgressBar(
             description,
@@ -130,7 +130,7 @@
         optimizer_kwargs: Optional[Dict] = None,
         device: str = 'cpu',
         **kwargs,
-    ) -> SummaryCollection:
+    ) -> Summary:
         """Finetune the model on the training data.
 
         :param train_data: Data on which to train the model
@@ -171,8 +171,8 @@
 
         _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate)
 
-        m_train_loss = ScalarSummary('train')
-        m_eval_loss = ScalarSummary('eval')
+        m_train_loss = ScalarSequence('train')
+        m_eval_loss = ScalarSequence('eval')
 
         with get_device(device):
             for epoch in range(epochs):
@@ -187,7 +187,7 @@
                     le = self._eval(_eval_data, train_loss=m_train_loss)
                     m_eval_loss += le
 
-        return SummaryCollection(m_train_loss, m_eval_loss)
+        return Summary(m_train_loss, m_eval_loss)
 
     def save(self, *args, **kwargs):
         """Save the embedding model.
diff --git a/finetuner/tuner/paddle/__init__.py b/finetuner/tuner/paddle/__init__.py
index f999c7531..47d99eb2f 100644
--- a/finetuner/tuner/paddle/__init__.py
+++ b/finetuner/tuner/paddle/__init__.py
@@ -8,7 +8,7 @@
 from . import losses, datasets
 from ..base import BaseTuner, BaseLoss
 from ..dataset.helper import get_dataset
-from ..summary import ScalarSummary, SummaryCollection
+from ..summary import ScalarSequence, Summary
 from ...helper import DocumentArrayLike, AnyDataLoader
 
 
@@ -63,13 +63,13 @@
         self,
         data: AnyDataLoader,
         description: str = 'Evaluating',
-        train_loss: Optional[ScalarSummary] = None,
-    ) -> ScalarSummary:
+        train_loss: Optional[ScalarSequence] = None,
+    ) -> ScalarSequence:
         """Evaluate the model on given labeled data"""
 
         self._embed_model.eval()
 
-        _summary = ScalarSummary('Eval Loss')
+        _summary = ScalarSequence('Eval Loss')
 
         with ProgressBar(
             description,
@@ -90,12 +90,12 @@ def _eval(
 
     def _train(
         self, data: AnyDataLoader, optimizer: Optimizer, description: str
-    ) -> ScalarSummary:
+    ) -> ScalarSequence:
         """Train the model on given labeled data"""
 
         self._embed_model.train()
 
-        _summary = ScalarSummary('Train Loss')
+        _summary = ScalarSequence('Train Loss')
         with ProgressBar(
             description,
             message_on_done=_summary.__str__,
@@ -130,7 +130,7 @@
         optimizer_kwargs: Optional[Dict] = None,
         device: str = 'cpu',
         **kwargs,
-    ) -> SummaryCollection:
+    ) -> Summary:
         """Finetune the model on the training data.
 
         :param train_data: Data on which to train the model
@@ -164,8 +164,8 @@
 
         _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate)
 
-        m_train_loss = ScalarSummary('train')
-        m_eval_loss = ScalarSummary('eval')
+        m_train_loss = ScalarSequence('train')
+        m_eval_loss = ScalarSequence('eval')
 
         for epoch in range(epochs):
             _data = self._get_data_loader(
@@ -186,7 +186,7 @@
                 le = self._eval(_data, train_loss=m_train_loss)
                 m_eval_loss += le
 
-        return SummaryCollection(m_train_loss, m_eval_loss)
+        return Summary(m_train_loss, m_eval_loss)
 
     def save(self, *args, **kwargs):
         """Save the embedding model.
diff --git a/finetuner/tuner/pytorch/__init__.py b/finetuner/tuner/pytorch/__init__.py
index f99f7905d..6daa900d8 100644
--- a/finetuner/tuner/pytorch/__init__.py
+++ b/finetuner/tuner/pytorch/__init__.py
@@ -8,7 +8,7 @@
 from . import losses, datasets
 from ..base import BaseTuner, BaseLoss
 from ..dataset.helper import get_dataset
-from ..summary import ScalarSummary, SummaryCollection
+from ..summary import ScalarSequence, Summary
 from ...helper import DocumentArrayLike, AnyDataLoader
 
 
@@ -67,13 +67,13 @@
         self,
         data: AnyDataLoader,
         description: str = 'Evaluating',
-        train_loss: Optional[ScalarSummary] = None,
-    ) -> ScalarSummary:
+        train_loss: Optional[ScalarSequence] = None,
+    ) -> ScalarSequence:
         """Evaluate the model on given labeled data"""
 
         self._embed_model.eval()
 
-        _summary = ScalarSummary('Eval Loss')
+        _summary = ScalarSequence('Eval Loss')
 
         with ProgressBar(
             description,
@@ -99,12 +99,12 @@ def _eval(
 
     def _train(
         self, data: AnyDataLoader, optimizer: Optimizer, description: str
-    ) -> ScalarSummary:
+    ) -> ScalarSequence:
         """Train the model on given labeled data"""
 
         self._embed_model.train()
 
-        _summary = ScalarSummary('Train Loss')
+        _summary = ScalarSequence('Train Loss')
         with ProgressBar(
             description,
             message_on_done=_summary.__str__,
@@ -142,7 +142,7 @@
         optimizer_kwargs: Optional[Dict] = None,
         device: str = 'cpu',
         **kwargs,
-    ) -> SummaryCollection:
+    ) -> Summary:
         """Finetune the model on the training data.
 
         :param train_data: Data on which to train the model
@@ -179,8 +179,8 @@
         # Get optimizer
         _optimizer = self._get_optimizer(optimizer, optimizer_kwargs, learning_rate)
 
-        m_train_loss = ScalarSummary('train')
-        m_eval_loss = ScalarSummary('eval')
+        m_train_loss = ScalarSequence('train')
+        m_eval_loss = ScalarSequence('eval')
 
         for epoch in range(epochs):
             _data = self._get_data_loader(
@@ -201,7 +201,7 @@
                 le = self._eval(_data, train_loss=m_train_loss)
                 m_eval_loss += le
 
-        return SummaryCollection(m_train_loss, m_eval_loss)
+        return Summary(m_train_loss, m_eval_loss)
 
     def save(self, *args, **kwargs):
         """Save the embedding model.
diff --git a/finetuner/tuner/summary.py b/finetuner/tuner/summary.py
index 326b0fa9c..b84aa40dc 100644
--- a/finetuner/tuner/summary.py
+++ b/finetuner/tuner/summary.py
@@ -8,43 +8,45 @@
 ]  #: The type of numerics including numpy data type
 
 
-class ScalarSummary:
-    def __init__(self, name: str = '', data: Optional[List[NumericType]] = None):
+class ScalarSequence:
+    def __init__(self, name: str):
         """Create a record for storing a list of scalar values e.g. losses/metrics
 
         :param name: the name of that record
-        :param data: the data record to initialize from
         """
-        self._name = name or ''
-        self._record = data or []
+        self.name = name
+        self._record = []
 
-    def __iadd__(self, other: Union[List[NumericType], float, 'ScalarSummary']):
+    def __iadd__(self, other: Union[List[NumericType], float, 'ScalarSequence']):
         if isinstance(other, list):
             self._record += other
-        elif isinstance(other, ScalarSummary):
+        elif isinstance(other, ScalarSequence):
             self._record += other._record
+        elif isinstance(other, np.ndarray) and np.squeeze(other).ndim == 1:
+            self._record += [v for v in np.squeeze(other)]
         else:
             self._record.append(other)
         return self
 
     def __str__(self):
         if self._record:
-            return (
-                f'{self._name}: {np.mean([float(loss) for loss in self._record]):.2f}'
-            )
+            return f'{self.name}: {np.mean([float(loss) for loss in self._record]):.2f}'
         else:
-            return f'{self._name} has no record'
+            return f'{self.name} has no record'
 
     def floats(self) -> List[NumericType]:
         """Return all numbers as a list of Python native float """
         return [float(v) for v in self._record]
 
+    def __bool__(self):
+        return bool(self._record)
+
 
-class SummaryCollection:
-    def __init__(self, *records: ScalarSummary):
+class Summary:
+    def __init__(self, *records: ScalarSequence):
         """Create a collection of summaries. """
-        self._records = records
+        self._records = [r for r in records if r]
 
     def save(self, filepath: str):
         """Store all summary into a JSON file"""
@@ -56,4 +58,57 @@ def save(self, filepath: str):
 
     def dict(self) -> Dict[str, List[NumericType]]:
         """Return all summaries as a Dictionary, where key is the name and value is the record"""
-        return {r._name: r.floats() for r in self._records}
+        return {r.name: r.floats() for r in self._records}
+
+    def plot(
+        self,
+        output: Optional[str] = None,
+        max_plot_points: Optional[int] = None,
+        **kwargs,
+    ):
+        """Plot all records in the summary into one plot.
+
+        .. note::
+            This function requires ``matplotlib`` to be installed.
+
+        :param output: Optional path to store the visualization. If not given, show in UI
+        :param max_plot_points: the maximum number of points to plot. When the actual number of points is larger
+            than this value, a linspace sampling is conducted first to reduce the points to this number.
+        :param kwargs: extra kwargs passed to matplotlib ``plot()``
+        """
+        import matplotlib.pyplot as plt
+
+        fig, axes = plt.subplots(
+            1,
+            len(self._records),
+            figsize=(6 * len(self._records), 6),
+            constrained_layout=True,
+        )
+        if not isinstance(axes, np.ndarray):
+            # with a single record there is only one Axes object, so wrap it
+            axes = [axes]
+
+        plt_kwargs = dict(alpha=0.8, linewidth=1)
+        plt_kwargs.update(kwargs)
+
+        for idx, record in enumerate(self._records):
+            axes[idx].plot(
+                *self._sample_points(record.floats(), max_len=max_plot_points),
+                **plt_kwargs,
+            )
+            axes[idx].set_ylabel(record.name)
+            axes[idx].set_box_aspect(1)
+            axes[idx].set_xlabel('Steps')
+
+        if output:
+            plt.savefig(output, bbox_inches='tight', pad_inches=0.1)
+        else:
+            plt.show()
+
+    @staticmethod
+    def _sample_points(arr, max_len: int):
+        if not max_len or max_len > len(arr):
+            return list(range(0, len(arr))), arr
+        else:
+            idx = np.round(np.linspace(0, len(arr) - 1, max_len)).astype(int)
+            return idx, [arr[j] for j in idx]
diff --git a/setup.py b/setup.py
index c2292ecc1..14cb358d9 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
     long_description_content_type='text/markdown',
     zip_safe=False,
     setup_requires=['setuptools>=18.0', 'wheel'],
-    install_requires=['jina>=2.1.11'],
+    install_requires=['jina>=2.1.11', 'matplotlib'],
     classifiers=[
         'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Developers',
diff --git a/tests/unit/tuner/test_summary.py b/tests/unit/tuner/test_summary.py
new file mode 100644
index 000000000..7e81efefd
--- /dev/null
+++ b/tests/unit/tuner/test_summary.py
@@ -0,0 +1,23 @@
+import os.path
+
+from finetuner.tuner.summary import ScalarSequence, Summary
+import numpy as np
+
+
+def test_summary(tmpdir):
+    s1 = ScalarSequence('s1')
+    s2 = ScalarSequence('s2')
+    s1 += np.random.random((10,))
+    assert s1.floats()
+    s2 += np.random.random((100,))
+    assert s2.floats()
+
+    s3 = ScalarSequence('empty')
+
+    sm = Summary(s1, s2, s3)
+    assert len(sm.dict()) == 2  #: empty record is not counted
+    sm.plot()
+    sm.plot(max_plot_points=5)
+    sm.plot(max_plot_points=1000)
+    sm.plot(tmpdir / 'sm.png')
+    assert os.path.exists(tmpdir / 'sm.png')
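A minimal usage sketch of the renamed API introduced above, mirroring what the new unit test exercises (file names here are arbitrary examples; plot() requires matplotlib to be installed):

import numpy as np

from finetuner.tuner.summary import ScalarSequence, Summary

# A ScalarSequence accumulates scalars via +=: floats, lists,
# 1-D numpy arrays, or another ScalarSequence are all accepted.
train_loss = ScalarSequence('train')
eval_loss = ScalarSequence('eval')
train_loss += np.random.random((100,))  # e.g. one loss value per training step
eval_loss += [0.42, 0.31, 0.27]

summary = Summary(train_loss, eval_loss)          # empty sequences are dropped
print(summary.dict())                             # {'train': [...], 'eval': [...]}
summary.save('summary.json')                      # persist all records as JSON
summary.plot('summary.png', max_plot_points=50)   # subsample to at most 50 points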