[WIP] Upgrade the VLM example, adding a conversion script to Huggingface #7

Status: Open. Wants to merge 6 commits into main.
3 changes: 2 additions & 1 deletion conda_install.sh
@@ -29,13 +29,14 @@
 echo "Megatron-LM dir: $DIR"

 source ${CONDA_HOME}/etc/profile.d/conda.sh
 # python can't handle this dependency madness, switch to C++
-conda create -y -n ${ENV_NAME} python=3.10
+# conda create -y -n ${ENV_NAME} python=3.10
 conda activate ${ENV_NAME}


 pip install ninja

 # install our own copy of CUDA and set environment variables
+conda install -y openldap
 conda install -y -c "nvidia/label/cuda-12.4.0" cuda-toolkit cuda-nvcc cudnn

 export PATH=${CONDA_ENVS}/${ENV_NAME}/bin:$PATH
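After building the environment, it is worth confirming that the conda-provided CUDA toolchain, not a system copy, is what PATH resolves to. A minimal sanity check (a hypothetical helper, not part of this PR):

```python
import shutil


def toolchain_report(binaries=("nvcc", "python", "ninja")):
    """Map each binary name to the path PATH resolves it to, or None if absent."""
    return {name: shutil.which(name) for name in binaries}


if __name__ == "__main__":
    for name, path in toolchain_report().items():
        print(f"{name}: {path if path else 'NOT FOUND'}")
```

With the conda env active, `nvcc` should resolve under `${CONDA_ENVS}/${ENV_NAME}/bin`.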
6 changes: 4 additions & 2 deletions examples/multimodal/combine_lm_vision_checkpoints.sh
@@ -4,9 +4,11 @@
 MCORE_VISION=$2 # <path_to_mcore_vision_model_folder>
 OUTPUT_DIR=$3 # <path_to_output_folder_for_combined_checkpoint>
 MODEL_TYPE=$4 # Model type. Default: Mistral CLIP example.

+SCRIPT_DIR=$(dirname $(readlink -f $0))
+
 if [[ $MODEL_TYPE == "nvlm" ]]; then
 # NVLM TP=8
-python examples/multimodal/combine_state_dicts.py \
+python ${SCRIPT_DIR}/combine_state_dicts.py \
 --input \
 ${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
 ${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
@@ -36,7 +38,7 @@ if [[ $MODEL_TYPE == "nvlm" ]]; then
 ${OUTPUT_DIR}/iter_0000001/mp_rank_07/model_optim_rng.pt
 else
 # Mistral CLIP example TP=4.
-python examples/multimodal/combine_state_dicts.py \
+python ${SCRIPT_DIR}/combine_state_dicts.py \
 --input \
 ${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
 ${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
5 changes: 3 additions & 2 deletions examples/multimodal/config.py
@@ -166,7 +166,7 @@ def get_vision_model_config(config, apply_query_key_layer_scaling):
 def get_vision_projection_config(config, hidden_size):
     config.gated_linear_unit = False
     config.bias_activation_fusion = False
-    config.add_bias_linear = False
+    config.add_bias_linear = True  # This was changed to make it compatible with HF's LLaVA.
     config.hidden_size = hidden_size  # Used as the vision projection output size, i.e., the input to the language model.
     if config.language_model_type == "2b":
         config.ffn_hidden_size = 5440
@@ -178,7 +178,8 @@ def get_vision_projection_config(config, hidden_size):
         config.ffn_hidden_size = 14336
         config.activation_func = torch.nn.functional.gelu
     elif config.language_model_type == "mistral_7b":
-        config.ffn_hidden_size = 14336
+        # TODO: check what needs to be done for other models
+        config.ffn_hidden_size = hidden_size  # This was changed to make it compatible with HF's LLaVA.
         config.activation_func = torch.nn.functional.gelu
     elif config.language_model_type == "yi-34b":
         config.ffn_hidden_size = 20480
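The two `config.py` changes (`add_bias_linear = True` and `ffn_hidden_size = hidden_size`) line up Megatron's vision projection MLP with HF LLaVA's `multi_modal_projector`, which is a two-layer biased MLP whose intermediate width equals the language model's hidden size. A dependency-free sketch of the weight shapes being matched (layer names follow the HF LLaVA module; the concrete dimensions are illustrative, e.g. a CLIP-style vision width feeding a Mistral-7B-style text width):

```python
def llava_projector_shapes(vision_hidden: int, text_hidden: int):
    """Weight shapes of a LLaVA-style multi_modal_projector: two biased Linears."""
    return {
        "linear_1.weight": (text_hidden, vision_hidden),
        "linear_1.bias": (text_hidden,),                # needs add_bias_linear=True
        "linear_2.weight": (text_hidden, text_hidden),  # hence ffn_hidden_size == hidden_size
        "linear_2.bias": (text_hidden,),
    }


# Illustrative numbers: vision width 1024 projected into text width 4096.
for name, shape in llava_projector_shapes(1024, 4096).items():
    print(name, shape)
```

With the old `ffn_hidden_size = 14336`, the first projection layer would have had a 14336-wide intermediate output, which no LLaVA projector weight could be mapped onto; tying it to `hidden_size` makes the Megatron and HF shapes convert one-to-one.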