diff --git a/models/ggml_to_pt.py b/models/ggml_to_pt.py
new file mode 100644
index 00000000000..0997fbf81e4
--- /dev/null
+++ b/models/ggml_to_pt.py
@@ -0,0 +1,87 @@
+import struct
+import sys
+from collections import OrderedDict
+from pathlib import Path
+
+import numpy as np
+import torch
+from whisper import ModelDimensions, Whisper
+
+if len(sys.argv) < 3:
+    print("Usage: ggml_to_pt.py model.bin dir-output\n")
+    sys.exit(1)
+
+fname_inp = Path(sys.argv[1])
+dir_out = Path(sys.argv[2])
+fname_out = dir_out / "torch-model.pt"
+
+# Open the ggml file
+with open(fname_inp, "rb") as f:
+    # Read magic number and hyperparameters
+    magic_number, n_vocab, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer, n_text_ctx, n_text_state, n_text_head, n_text_layer, n_mels, use_f16 = struct.unpack("12i", f.read(48))
+    print(f"Magic number: {magic_number}")
+    print(f"Vocab size: {n_vocab}")
+    print(f"Audio context size: {n_audio_ctx}")
+    print(f"Audio state size: {n_audio_state}")
+    print(f"Audio head size: {n_audio_head}")
+    print(f"Audio layer size: {n_audio_layer}")
+    print(f"Text context size: {n_text_ctx}")
+    print(f"Text state size: {n_text_state}")
+    print(f"Text head size: {n_text_head}")
+    print(f"Text layer size: {n_text_layer}")
+    print(f"Mel size: {n_mels}")
+
+    # Read the mel filterbank; it is not part of the PyTorch checkpoint,
+    # but it has to be consumed to reach the tensor data
+    filters_shape_0 = struct.unpack("i", f.read(4))[0]
+    print(f"Filters shape 0: {filters_shape_0}")
+    filters_shape_1 = struct.unpack("i", f.read(4))[0]
+    print(f"Filters shape 1: {filters_shape_1}")
+    mel_filters = np.fromfile(f, dtype=np.float32, count=filters_shape_0 * filters_shape_1).reshape(filters_shape_0, filters_shape_1)
+
+    # Skip over the tokenizer vocabulary; the PyTorch checkpoint does not store it
+    num_tokens = struct.unpack("i", f.read(4))[0]
+    for _ in range(num_tokens):
+        token_len = struct.unpack("i", f.read(4))[0]
+        f.read(token_len)
+
+    # Read the model tensors
+    model_state_dict = OrderedDict()
+    while True:
+        try:
+            n_dims, name_length, ftype = struct.unpack("iii", f.read(12))
+        except struct.error:
+            break  # end of file
+        # ggml stores the dimensions in reverse order compared to PyTorch
+        dims = [struct.unpack("i", f.read(4))[0] for _ in range(n_dims)]
+        dims = dims[::-1]
+        name = f.read(name_length).decode("utf-8")
+        if ftype == 1:  # f16
+            data = np.fromfile(f, dtype=np.float16, count=np.prod(dims)).reshape(dims)
+        else:  # f32
+            data = np.fromfile(f, dtype=np.float32, count=np.prod(dims)).reshape(dims)
+
+        # The conv biases are stored as 2-D tensors in the ggml file;
+        # PyTorch expects them as 1-D
+        if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
+            data = data[:, 0]
+
+        model_state_dict[name] = torch.from_numpy(data)
+
+# Load the state_dict into a model with the same architecture,
+# then save the model in PyTorch format
+dims = ModelDimensions(
+    n_mels=n_mels,
+    n_audio_ctx=n_audio_ctx,
+    n_audio_state=n_audio_state,
+    n_audio_head=n_audio_head,
+    n_audio_layer=n_audio_layer,
+    n_text_ctx=n_text_ctx,
+    n_text_state=n_text_state,
+    n_text_head=n_text_head,
+    n_text_layer=n_text_layer,
+    n_vocab=n_vocab,
+)
+model = Whisper(dims)
+model.load_state_dict(model_state_dict)
+torch.save(model.state_dict(), fname_out)