TensorMap interpretation field (#101)
* default tensor from file mostly removed, group is gone, source becomes path_prefix
lucidtronix authored Feb 4, 2020
1 parent 431e458 commit eea8602
Showing 15 changed files with 7,094 additions and 7,999 deletions.
761 changes: 159 additions & 602 deletions ml4cvd/TensorMap.py

Large diffs are not rendered by default.
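
Since the largest diff is not rendered, here is a hedged sketch of the headline change: TensorMaps now carry an explicit `Interpretation` enum instead of the old free-form `group` string, and `source` becomes `path_prefix`. The constructor details below are assumptions inferred from the call sites visible in models.py and the commit message, not from the hidden TensorMap.py diff itself; the shape is illustrative.

```python
from ml4cvd.TensorMap import TensorMap, Interpretation

# Hypothetical declaration in the new style; keyword names beyond
# `interpretation` and `path_prefix` are assumptions.
ecg_text = TensorMap('ecg_rest_text', shape=(100, 26),
                     interpretation=Interpretation.LANGUAGE,
                     path_prefix='ecg_rest_text')
```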

7 changes: 1 addition & 6 deletions ml4cvd/arguments.py
@@ -23,7 +23,7 @@
 from ml4cvd.TensorMap import TensorMap
 from ml4cvd.tensor_maps_by_hand import TMAPS
 from ml4cvd.defines import IMPUTATION_RANDOM, IMPUTATION_MEAN
-from ml4cvd.tensor_map_maker import generate_multi_field_continuous_tensor_map, generate_continuous_tensor_map_from_file
+from ml4cvd.tensor_map_maker import generate_continuous_tensor_map_from_file


 def parse_args():
@@ -38,7 +38,6 @@ def parse_args():
     # Tensor Map arguments
     parser.add_argument('--input_tensors', default=[], nargs='+')
     parser.add_argument('--output_tensors', default=[], nargs='+')
-    parser.add_argument('--input_continuous_tensors', default=[], nargs='+', help='Continuous tensor maps to be combined.')
     parser.add_argument('--tensor_maps_in', default=[], help='Do not set this directly. Use input_tensors')
     parser.add_argument('--tensor_maps_out', default=[], help='Do not set this directly. Use output_tensors')

@@ -194,10 +193,6 @@ def _process_args(args):
         f.write(k + ' = ' + str(v) + '\n')
     load_config(args.logging_level, os.path.join(args.output_folder, args.id), 'log_' + now_string, args.min_sample_id)
     args.tensor_maps_in = [_get_tmap(it) for it in args.input_tensors]
-    if len(args.input_continuous_tensors) > 0:
-        multi_field_tensor_map = [generate_multi_field_continuous_tensor_map(args.input_continuous_tensors, args.include_missing_continuous_channel,
-                                                                             args.imputation_method_for_continuous_fields)]
-        args.tensor_maps_in.extend(multi_field_tensor_map)

     args.tensor_maps_out = []
     if args.continuous_file is not None:
14 changes: 8 additions & 6 deletions ml4cvd/defines.py
@@ -2,15 +2,16 @@
 from enum import Enum, auto


-class DataSetType(Enum):
-    FLOAT_ARRAY = auto()
+class StorageType(Enum):
     CONTINUOUS = auto()
-    CATEGORICAL = auto()
+    CATEGORICAL_INDEX = auto()
+    CATEGORICAL_FLAG = auto()
+    ONE_HOT = auto()
     STRING = auto()
-    SERIES = auto()
+    BYTE_STRING = auto()

     def __str__(self):
-        """DataSetType.FLOAT_ARRAY becomes float_array"""
+        """StorageType.CONTINUOUS becomes continuous"""
         return str.lower(super().__str__().split('.')[1])

@@ -23,6 +24,7 @@ def __str__(self):
 PDF_EXT = '.pdf'
 TENSOR_EXT = '.hd5'

+STOP_CHAR = '!'
 JOIN_CHAR = '_'
 CONCAT_CHAR = '-'
 HD5_GROUP_CHAR = '/'
@@ -43,7 +45,7 @@ def __str__(self):
 MRI_PATIENT_ORIENTATION = 'mri_patient_orientation'
 MRI_SEGMENTED_CHANNEL_MAP = {'background': 0, 'ventricle': 1, 'myocardium': 2}
 MRI_ANNOTATION_CHANNEL_MAP = {'good': 0, 'included-lvot': 1, 'mistrace': 2, 'phantom-apex': 3, 'hardclip': 4}
-MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP = {'background': 0, 'LV_A_S': 1, 'left_atrium': 2, 'LV_I_P': 3, 'LV_Pap': 4, 'LV_Cavity': 5}
+MRI_LAX_3CH_SEGMENTED_CHANNEL_MAP = {'background': 0, 'LV_anteroseptum': 1, 'left_atrium': 2, 'LV_inferior_wall': 3, 'LV_Papillary': 4, 'LV_Cavity': 5}
 MRI_LAX_4CH_SEGMENTED_CHANNEL_MAP = {'background': 0, 'RV_free_wall': 1, 'RA_free_wall': 2, 'LA_free_wall': 3, 'LV_anterolateral_wall': 4,
                                      'interventricular_septum': 5, 'interatrial_septum': 6, 'crista_terminalis': 7, 'RA_cavity': 8, 'RV_cavity': 9,
                                      'LA_cavity': 10, 'LV_cavity': 11, 'descending_aorta': 12, 'thoracic_cavity': 13}
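
As a quick, runnable illustration of the `__str__` override on the renamed enum (members are those added above; the behavior follows directly from the code):

```python
from enum import Enum, auto

class StorageType(Enum):
    CONTINUOUS = auto()
    ONE_HOT = auto()

    def __str__(self):
        # 'StorageType.ONE_HOT' -> split on '.' -> 'ONE_HOT' -> lowercase
        return str.lower(super().__str__().split('.')[1])

print(str(StorageType.ONE_HOT))  # prints: one_hot
```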
18 changes: 10 additions & 8 deletions ml4cvd/explorations.py
@@ -53,11 +53,13 @@ def sort_csv(input_csv_file, value_csv):
 def predictions_to_pngs(predictions: np.ndarray, tensor_maps_in: List[TensorMap], tensor_maps_out: List[TensorMap], data: Dict[str, np.ndarray],
                         labels: Dict[str, np.ndarray], paths: List[str], folder: str) -> None:
     input_map = tensor_maps_in[0]
+    if not os.path.exists(folder):
+        os.makedirs(folder)
     for y, tm in zip(predictions, tensor_maps_out):
         if not isinstance(predictions, list):  # When a model has a single output, model.predict returns an ndarray; otherwise it returns a list
             y = predictions
         for im in tensor_maps_in:
-            if tm.is_categorical_any() and im.dependent_map == tm:
+            if tm.is_categorical() and im.dependent_map == tm:
                 input_map = im
             elif len(tm.shape) == len(im.shape):
                 input_map = im
@@ -80,7 +82,7 @@ def predictions_to_pngs(predictions: np.ndarray, tensor_maps_in: List[TensorMap],
         elif len(tm.shape) == 3:
             for i in range(y.shape[0]):
                 sample_id = os.path.basename(paths[i]).replace(TENSOR_EXT, '')
-                if tm.is_categorical_any():
+                if tm.is_categorical():
                     plt.imsave(f"{folder}{sample_id}_truth_{i:02d}{IMAGE_EXT}", np.argmax(labels[tm.output_name()][i], axis=-1), cmap='gray')
                     plt.imsave(f"{folder}{sample_id}_prediction_{i:02d}{IMAGE_EXT}", np.argmax(y[i], axis=-1), cmap='gray')
                     if input_map is not None:
@@ -97,7 +99,7 @@ def predictions_to_pngs(predictions: np.ndarray, tensor_maps_in: List[TensorMap],
             for i in range(y.shape[0]):
                 sample_id = os.path.basename(paths[i]).replace(TENSOR_EXT, '')
                 for j in range(y.shape[3]):
-                    if tm.is_categorical_any():
+                    if tm.is_categorical():
                         truth = np.argmax(labels[tm.output_name()][i, :, :, j, :], axis=-1)
                         prediction = np.argmax(y[i, :, :, j, :], axis=-1)
                         true_donut = np.ma.masked_where(truth == 2, data[im.input_name()][i, :, :, j, 0])
@@ -134,12 +136,12 @@ def plot_while_learning(model, tensor_maps_in: List[TensorMap], tensor_maps_out:
             vmin = np.min(mri_in)
             vmax = np.max(mri_in)
             logging.info(f"epoch:{i} write segmented mris y shape:{y.shape} label shape:{test_labels[tm.output_name()].shape} to folder:{folder}")
-            if tm.is_categorical_any() and len(tm.shape) == 3:
+            if tm.is_categorical() and len(tm.shape) == 3:
                 for yi in range(y.shape[0]):
                     plt.imsave(f"{folder}batch_{yi}_truth_epoch_{i:03d}{IMAGE_EXT}", np.argmax(test_labels[tm.output_name()][yi], axis=-1), cmap='gray')
                     plt.imsave(f"{folder}batch_{yi}_prediction_epoch_{i:03d}{IMAGE_EXT}", np.argmax(y[yi], axis=-1), cmap='gray')
                     plt.imsave(f"{folder}batch_{yi}_mri_epoch_{i:03d}{IMAGE_EXT}", mri_in[yi, :, :, 0], cmap='gray', vmin=vmin, vmax=vmax)
-            elif tm.is_categorical_any() and len(tm.shape) == 4:
+            elif tm.is_categorical() and len(tm.shape) == 4:
                 for yi in range(y.shape[0]):
                     for j in range(y.shape[3]):
                         truth = np.argmax(test_labels[tm.output_name()][yi, :, :, j, :], axis=-1)
@@ -341,7 +343,7 @@ def test_labels_to_label_map(test_labels: Dict[TensorMap, np.ndarray], examples:
         if tm.is_continuous():
             label_dict[tm][i] = tm.rescale(test_labels[tm][i])
             continuous_labels.append(tm)
-        elif tm.is_categorical_any():
+        elif tm.is_categorical():
             label_dict[tm][i] = np.argmax(test_labels[tm][i])
             categorical_labels.append(tm)

@@ -364,7 +366,7 @@ def infer_with_pixels(args):
     for ot, otm in zip(args.output_tensors, args.tensor_maps_out):
         if len(otm.shape) == 1 and otm.is_continuous():
             header.extend([ot+'_prediction', ot+'_actual'])
-        elif len(otm.shape) == 1 and otm.is_categorical_any():
+        elif len(otm.shape) == 1 and otm.is_categorical():
             channel_columns = []
             for k in otm.channel_map:
                 channel_columns.append(ot + '_' + k + '_prediction')
Expand Down Expand Up @@ -395,7 +397,7 @@ def infer_with_pixels(args):
csv_row.append("NA")
else:
csv_row.append(str(tm.rescale(true_label[tm.output_name()])[0][0]))
elif len(tm.shape) == 1 and tm.is_categorical_any():
elif len(tm.shape) == 1 and tm.is_categorical():
for k in tm.channel_map:
csv_row.append(str(y[0][tm.channel_map[k]]))
csv_row.append(str(true_label[tm.output_name()][0][tm.channel_map[k]]))
Expand Down
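
All of the segmentation branches above decode one-hot model output the same way before handing it to `plt.imsave`; a minimal self-contained sketch of that step:

```python
import numpy as np

# Fake one-hot segmentation of shape (height, width, channels).
one_hot = np.zeros((4, 4, 3))
one_hot[:2, :, 1] = 1.0   # top half labeled class 1
one_hot[2:, :, 2] = 1.0   # bottom half labeled class 2

# argmax over the channel axis yields an integer class map, ready for imsave(..., cmap='gray').
label_image = np.argmax(one_hot, axis=-1)
print(label_image.shape)  # (4, 4)
```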
20 changes: 14 additions & 6 deletions ml4cvd/metrics.py
@@ -50,10 +50,6 @@ def angle_between_batches(tensors):
     return tf.acos(numerator / (l0*l1))


-def sum_pred_loss(y_true, y_pred):
-    return K.sum(y_pred, axis=-1)
-
-
 def two_batch_euclidean(tensors):
     return K.sqrt(K.sum(K.square(tensors[0] - tensors[1]), axis=-1, keepdims=True) + K.epsilon())

@@ -110,8 +106,20 @@ def ignore_sentinel_logcosh(y_true, y_pred):
     return ignore_sentinel_logcosh


-def sum_pred_loss(y_true, y_pred):
-    return K.sum(y_pred, axis=-1)
+def y_true_times_mse(y_true, y_pred):
+    return K.maximum(y_true, 1.0)*mean_squared_error(y_true, y_pred)
+
+
+def y_true_squared_times_mse(y_true, y_pred):
+    return K.maximum(1.0+y_true, 1.0)*K.maximum(1.0+y_true, 1.0)*mean_squared_error(y_true, y_pred)
+
+
+def y_true_cubed_times_mse(y_true, y_pred):
+    return K.maximum(y_true, 1.0)*K.maximum(y_true, 1.0)*K.maximum(y_true, 1.0)*mean_squared_error(y_true, y_pred)
+
+
+def y_true_squared_times_logcosh(y_true, y_pred):
+    return K.maximum(1.0+y_true, 1.0)*K.maximum(1.0+y_true, 1.0)*logcosh(y_true, y_pred)


 def two_batch_euclidean(tensors):
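
The four new `y_true_*` losses follow one pattern: scale a standard regression loss by a floor-clipped function of the label, so examples with large true values weigh more heavily in training. A hedged usage sketch; the commented compile call is a placeholder and `model` is assumed to exist elsewhere:

```python
from keras import backend as K
from keras.losses import mean_squared_error

def y_true_squared_times_mse(y_true, y_pred):
    # Weight the MSE by max(1 + y_true, 1)^2: labels near zero get weight ~1,
    # large labels get quadratically larger weight.
    weight = K.maximum(1.0 + y_true, 1.0)
    return weight * weight * mean_squared_error(y_true, y_pred)

# model.compile(optimizer='adam', loss=y_true_squared_times_mse)  # placeholder model
```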
42 changes: 21 additions & 21 deletions ml4cvd/models.py
@@ -1,11 +1,12 @@
# models.py
# This file defines model factories.
# Model factories connect input TensorMaps to output TensorMaps with computational graphs.

# Imports
import os
import h5py
import time
import logging
import operator
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple, Iterable, Union, Optional
@@ -24,12 +25,12 @@
 from keras.layers import Conv1D, Conv2D, Conv3D, UpSampling1D, UpSampling2D, UpSampling3D, MaxPooling1D
 from keras.layers import MaxPooling2D, MaxPooling3D, AveragePooling1D, AveragePooling2D, AveragePooling3D, Layer

-from ml4cvd.TensorMap import TensorMap
 from ml4cvd.metrics import get_metric_dict
-from ml4cvd.optimizers import get_optimizer
 from ml4cvd.plots import plot_metric_history
+from ml4cvd.TensorMap import TensorMap, Interpretation
 from ml4cvd.defines import JOIN_CHAR, IMAGE_EXT, TENSOR_EXT, ECG_CHAR_2_IDX
+from ml4cvd.optimizers import get_optimizer
 from ml4cvd.lookahead import Lookahead


 CHANNEL_AXIS = -1  # Set to 1 for Theano backend

@@ -202,15 +203,15 @@ def make_character_model(tensor_maps_in: List[TensorMap], tensor_maps_out: List[

     input_layers = []
     for it in tensor_maps_in:
-        if it.is_hidden_layer():
+        if it.is_embedding():
             embed_in = Input(shape=it.shape, name=it.input_name())
             input_layers.append(embed_in)
-        elif it.is_ecg_text():
+        elif it.is_language():
             burn_in = Input(shape=it.shape, name=it.input_name())
             input_layers.append(burn_in)
             repeater = RepeatVector(it.shape[0])
         else:
-            logging.warning(f"character model can't handle {it.name} from group:{it.group}")
+            logging.warning(f"character model can't handle input TensorMap:{it.name} with interpretation:{it.interpretation}")

     logging.info(f"inputs: {[il.name for il in input_layers]}")
     wave_embeds = repeater(embed_in)
@@ -634,7 +635,7 @@ def make_multimodal_multitask_model(tensor_maps_in: List[TensorMap] = None,
             dense_pool_layers = _pool_layers_from_kind_and_dimension(len(tm.shape), pool_type, len(dense_blocks), pool_x, pool_y, pool_z)
             last_conv = _dense_block(last_conv, layers, block_size, dense_conv_fxns, dense_pool_layers, len(tm.shape), activation, conv_normalize,
                                      conv_regularize, conv_dropout)
-            input_multimodal.append(Flatten()(last_conv))
+            input_multimodal.append(Flatten(name=f'embed_{tm.output_name()}')(last_conv))
         else:
             mlp_input = input_tensors[j]
             mlp = _dense_layer(mlp_input, layers, tm.annotation_units, activation, conv_normalize)
@@ -689,15 +690,15 @@ def make_multimodal_multitask_model(tensor_maps_in: List[TensorMap] = None,
             last_conv = conv_layer(filters=all_filters[-(1 + i)], kernel_size=kernel, padding=padding)(last_conv)

         conv_label = conv_layer(tm.shape[channel_axis], _one_by_n_kernel(len(tm.shape)), activation="linear")(last_conv)
-        output_predictions[tm.output_name()] = Activation(tm.activation, name=tm.output_name())(conv_label)
+        output_predictions[tm] = Activation(tm.activation, name=tm.output_name())(conv_label)
     elif tm.parents is not None:
-        parented_activation = concatenate([multimodal_activation] + [output_predictions[p.output_name()] for p in tm.parents])
+        parented_activation = concatenate([multimodal_activation] + [output_predictions[p] for p in tm.parents])
         parented_activation = _dense_layer(parented_activation, layers, tm.annotation_units, activation, conv_normalize)
-        output_predictions[tm.output_name()] = Dense(units=tm.shape[0], activation=tm.activation, name=tm.output_name())(parented_activation)
-    elif tm.is_categorical_any():
-        output_predictions[tm.output_name()] = Dense(units=tm.shape[0], activation='softmax', name=tm.output_name())(multimodal_activation)
+        output_predictions[tm] = Dense(units=tm.shape[0], activation=tm.activation, name=tm.output_name())(parented_activation)
+    elif tm.is_categorical():
+        output_predictions[tm] = Dense(units=tm.shape[0], activation='softmax', name=tm.output_name())(multimodal_activation)
     else:
-        output_predictions[tm.output_name()] = Dense(units=tm.shape[0], activation=tm.activation, name=tm.output_name())(multimodal_activation)
+        output_predictions[tm] = Dense(units=tm.shape[0], activation=tm.activation, name=tm.output_name())(multimodal_activation)

     m = Model(inputs=input_tensors, outputs=list(output_predictions.values()))
     m.summary()
@@ -1129,13 +1130,12 @@ def _gradients_from_output(model, output_layer, output_index):
     return iterate


-def _get_tensor_maps_for_characters(tensor_maps_in: List[TensorMap], base_model: Model):
-    embed_model = make_hidden_layer_model(base_model, tensor_maps_in, 'embed')
-    tm_embed = TensorMap('embed', shape=(64,), group='hidden_layer', required_inputs=tensor_maps_in.copy(), model=embed_model)
-    tm_char = TensorMap('ecg_rest_next_char', shape=(len(ECG_CHAR_2_IDX),), channel_map=ECG_CHAR_2_IDX, activation='softmax', loss='categorical_crossentropy',
-                        loss_weight=1.0, cacheable=False)
-    tm_burn_in = TensorMap('ecg_rest_text', shape=(100, len(ECG_CHAR_2_IDX)), group='ecg_text', channel_map={'context': 0, 'alphabet': 1},
-                           dependent_map=tm_char, cacheable=False)
+def _get_tensor_maps_for_characters(tensor_maps_in: List[TensorMap], base_model: Model, embed_name='embed', embed_size=64, burn_in=100):
+    embed_model = make_hidden_layer_model(base_model, tensor_maps_in, embed_name)
+    tm_embed = TensorMap(embed_name, shape=(embed_size,), interpretation=Interpretation.EMBEDDING, parents=tensor_maps_in.copy(), model=embed_model)
+    tm_char = TensorMap('ecg_rest_next_char', shape=(len(ECG_CHAR_2_IDX),), interpretation=Interpretation.LANGUAGE, channel_map=ECG_CHAR_2_IDX, cacheable=False)
+    tm_burn_in = TensorMap('ecg_rest_text', shape=(burn_in, len(ECG_CHAR_2_IDX)), interpretation=Interpretation.LANGUAGE,
+                           channel_map={'context': 0, 'alphabet': 1}, dependent_map=tm_char, cacheable=False)
     return [tm_embed, tm_burn_in], [tm_char]

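
A notable pattern change above: `output_predictions` is now keyed by `TensorMap` objects rather than their output names, so a map with `parents` can look up its parents' activations directly. For that to work, `TensorMap` must be hashable; a minimal stand-in sketch of the idea (a simplified class, not the real TensorMap):

```python
class TinyMap:
    """Stand-in for TensorMap, hashable by name so instances can key a dict."""
    def __init__(self, name, parents=None):
        self.name = name
        self.parents = parents or []

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        return isinstance(other, TinyMap) and self.name == other.name

parent = TinyMap('ecg_rest')
child = TinyMap('ecg_rest_next_char', parents=[parent])
output_predictions = {parent: 'parent_activation_tensor'}
# The new style: index by the TensorMap itself, no output_name() round-trip.
print([output_predictions[p] for p in child.parents])  # ['parent_activation_tensor']
```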
14 changes: 7 additions & 7 deletions ml4cvd/plots.py
@@ -78,37 +78,37 @@ def evaluate_predictions(tm: TensorMap, y_predictions: np.ndarray, y_truth: np.n
     :return: Dictionary of performance metrics with string keys for labels and float values
     """
     performance_metrics = {}
-    if tm.is_categorical_any() and len(tm.shape) == 1:
+    if tm.is_categorical() and tm.axes() == 1:
         logging.info(f"For tm:{tm.name} with channel map:{tm.channel_map} examples:{y_predictions.shape[0]}")
         logging.info(f"\nSum Truth:{np.sum(y_truth, axis=0)} \nSum pred :{np.sum(y_predictions, axis=0)}")
         plot_precision_recall_per_class(y_predictions, y_truth, tm.channel_map, title, folder)
         performance_metrics.update(plot_roc_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         rocs.append((y_predictions, y_truth, tm.channel_map))
-    elif tm.is_categorical() and len(tm.shape) == 2:
+    elif tm.is_categorical() and tm.axes() == 2:
         melt_shape = (y_predictions.shape[0] * y_predictions.shape[1], y_predictions.shape[2])
         idx = np.random.choice(np.arange(melt_shape[0]), max_melt, replace=False)
         y_predictions = y_predictions.reshape(melt_shape)[idx]
         y_truth = y_truth.reshape(melt_shape)[idx]
         performance_metrics.update(plot_roc_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         performance_metrics.update(plot_precision_recall_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         rocs.append((y_predictions, y_truth, tm.channel_map))
-    elif tm.is_categorical() and len(tm.shape) == 3:
+    elif tm.is_categorical() and tm.axes() == 3:
         melt_shape = (y_predictions.shape[0] * y_predictions.shape[1] * y_predictions.shape[2], y_predictions.shape[3])
         idx = np.random.choice(np.arange(melt_shape[0]), max_melt, replace=False)
         y_predictions = y_predictions.reshape(melt_shape)[idx]
         y_truth = y_truth.reshape(melt_shape)[idx]
         performance_metrics.update(plot_roc_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         performance_metrics.update(plot_precision_recall_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         rocs.append((y_predictions, y_truth, tm.channel_map))
-    elif tm.is_categorical_any() and len(tm.shape) == 4:
+    elif tm.is_categorical() and tm.axes() == 4:
         melt_shape = (y_predictions.shape[0] * y_predictions.shape[1] * y_predictions.shape[2] * y_predictions.shape[3], y_predictions.shape[4])
         idx = np.random.choice(np.arange(melt_shape[0]), max_melt, replace=False)
         y_predictions = y_predictions.reshape(melt_shape)[idx]
         y_truth = y_truth.reshape(melt_shape)[idx]
         performance_metrics.update(plot_roc_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         performance_metrics.update(plot_precision_recall_per_class(y_predictions, y_truth, tm.channel_map, title, folder))
         rocs.append((y_predictions, y_truth, tm.channel_map))
-    elif tm.is_proportional_hazard():
+    elif tm.is_cox_proportional_hazard():
         plot_survival(y_predictions, y_truth, title, prefix=folder)
         plot_survival_curves(y_predictions, y_truth, title, prefix=folder, paths=test_paths)
     elif len(tm.shape) > 1:
@@ -171,7 +171,7 @@ def plot_metric_history(history, title, prefix='./figures/'):

 def plot_scatter(prediction, truth, title, prefix='./figures/', paths=None, top_k=3, alpha=0.5):
     margin = float((np.max(truth)-np.min(truth))/100)
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(SUBPLOT_SIZE, 2 * SUBPLOT_SIZE), sharex='all')
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(SUBPLOT_SIZE, 2 * SUBPLOT_SIZE))
     ax1.plot([np.min(truth), np.max(truth)], [np.min(truth), np.max(truth)], linewidth=2)
     ax1.plot([np.min(prediction), np.max(prediction)], [np.min(prediction), np.max(prediction)], linewidth=4)
     pearson = np.corrcoef(prediction.flatten(), truth.flatten())[1, 0]  # corrcoef returns full covariance matrix
@@ -190,7 +190,7 @@ def plot_scatter(prediction, truth, title, prefix='./figures/', paths=None, top_
     ax1.set_xlabel('Predictions')
     ax1.set_ylabel('Actual')
     ax1.set_title(title + '\n')
-    ax1.legend(loc="upper left")
+    ax1.legend(loc="lower right")

     sns.distplot(prediction, label='Predicted', color='r', ax=ax2)
     sns.distplot(truth, label='Truth', color='b', ax=ax2)
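
The repeated `len(tm.shape)` → `tm.axes()` substitution above implies TensorMap gained a small accessor. Its body is not shown in this commit, but from the call sites it is presumably equivalent to this one-liner (a hypothetical reconstruction, not the actual method):

```python
def axes(self) -> int:
    # Hypothetical TensorMap method inferred from the plots.py call sites:
    # the number of dimensions of the tensor this map produces.
    return len(self.shape)
```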
