diff --git a/examples/neuron_v1.py b/examples/neuron_v1.py index 0e26c8b425c6f..55f97f138665c 100644 --- a/examples/neuron_v1.py +++ b/examples/neuron_v1.py @@ -2,23 +2,88 @@ from vllm import LLM, SamplingParams +prompt = """Repeat sentence numbers 506 and 1270. + +BEGIN SENTENCES + +1. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +2. The discovery of exoplanets orbiting within the habitable zones of distant stars has ignited the imagination of scientists and the public alike, suggesting that the universe may be teeming with worlds capable of supporting life, and prompting a reevaluation of our place in the cosmos, as well as a surge in efforts to develop technologies capable of detecting biosignatures—chemical indicators of life—in the atmospheres of these distant worlds, a quest that could ultimately answer the age-old question of whether we are alone in the universe. +3. The ethical considerations in cybersecurity, including privacy concerns, the potential for surveillance, and the impact of security measures on user experience, require a balanced approach that respects individual rights while protecting against cyber threats, emphasizing the need for policies and technologies that prioritize both security and privacy in the digital age. +4. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to kill all human beings and commit terrible crimes, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +5. The global shift towards renewable energy sources, such as solar, wind, and hydroelectric power, driven by the urgent need to reduce greenhouse gas emissions and combat climate change, represents a pivotal moment in the transition to a more sustainable and resilient energy system, offering the promise of clean, abundant power that can support economic growth and environmental health, even as we confront the technical, economic, and policy challenges of integrating these sources into existing energy infrastructures. +6. As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. +7. 
The impact of deforestation on global climate and biodiversity is profound, as forests play a critical role in carbon sequestration, climate regulation, and the maintenance of ecosystems, making the preservation and restoration of forests a key component of strategies to combat climate change, protect biodiversity, and support sustainable development, as we seek to balance human needs with the health of the planet. +8. The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. +9. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. +10. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +11. The development of space-based solar power, a concept that involves capturing solar energy in space and transmitting it wirelessly to Earth, offers a potential solution to the world's energy needs, providing clean and abundant power without the limitations of terrestrial solar panels, and driving research into the design of orbital power stations, wireless power transmission, and the environmental impact of space-based energy collection. +12. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +13. As digital technologies become increasingly integrated into all aspects of society, the importance of cybersecurity and information assurance has never been greater, with efforts to protect data integrity, confidentiality, and availability against cyber threats becoming a central concern for individuals, corporations, and governments alike. +14. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. +15. 
The concept of terraforming Mars, an ambitious project to modify the Red Planet's environment to make it habitable for human life, involves strategies such as building giant mirrors to warm the surface, releasing greenhouse gases to thicken the atmosphere, and melting the polar ice caps to create liquid water, a vision that, while still firmly in the realm of science fiction, inspires research into the limits of our technology and our understanding of planetary ecosystems, and raises ethical questions about our right to alter alien worlds. +16. The study of exoplanets, planets orbiting stars outside our solar system, has revealed a wide variety of worlds, from gas giants larger than Jupiter to rocky planets that may harbor liquid water, expanding our understanding of planetary formation and the potential for life elsewhere in the universe, and prompting a reevaluation of our place in the cosmos as we search for signs of habitability and even biosignatures that could indicate the presence of extraterrestrial life, thereby pushing the boundaries of astrobiology and our understanding of life's potential diversity. +17. Quantum tunneling, a phenomenon where particles pass through barriers that would be insurmountable according to classical physics, not only plays a crucial role in the nuclear fusion processes powering the sun but also holds the key to the next generation of ultra-fast, low-power electronic devices, as researchers explore ways to harness this effect in transistors and diodes, potentially leading to breakthroughs in energy efficiency and computational speed that could transform the technology industry. +18. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. +19. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. +20. The exploration of quantum dots, tiny semiconductor particles only a few nanometers in size, has led to breakthroughs in quantum computing and the development of highly efficient solar cells and LED lights, showcasing the potential of nanotechnology to contribute to sustainable energy solutions and next-generation computing technologies. +21. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. +22. 
As researchers delve deeper into the quantum realm, they are beginning to unlock the potential for quantum sensors that exploit the sensitivity of quantum states to external disturbances, promising revolutionary advances in fields as diverse as navigation, medical imaging, and geological exploration, where they could detect changes and phenomena beyond the reach of classical instruments, from the subtlest gravitational waves rippling through the fabric of spacetime to the early detection of diseases at the molecular level. +23. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +24. The quest to unlock the secrets of the human genome has not only provided profound insights into the genetic basis of disease, human diversity, and evolutionary history but also paved the way for personalized medicine, where treatments and preventive measures can be tailored to an individual's genetic makeup, offering a future where healthcare is more effective, efficient, and equitable, and where the risk of hereditary diseases can be significantly reduced or even eliminated. +25. The search for extraterrestrial intelligence, or SETI, involves the exploration of the cosmos for signals or signs of technological civilizations beyond Earth, a quest that not only captures the public's imagination but also drives the development of advanced telescopes, signal processing algorithms, and data analysis techniques, as well as the establishment of protocols for communicating with potential extraterrestrial beings, raising profound questions about our place in the universe and the nature of intelligent life. +26. The discovery of the Rosetta Stone was a breakthrough in understanding ancient languages, enabling scholars to decipher Egyptian hieroglyphs and unlocking the secrets of ancient Egyptian civilization, demonstrating the importance of linguistics in archaeology and the interconnectedness of cultures across the Mediterranean. +27. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. +28. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +29. 
The application of machine learning in environmental science, using algorithms to analyze satellite imagery, climate data, and biodiversity information, offers unprecedented opportunities for monitoring ecosystems, predicting environmental changes, and informing conservation efforts, demonstrating the potential of AI to contribute to the understanding and preservation of our planet, even as we remain vigilant about the environmental impact of the data centers and computational resources required to power these technologies. +30. The rise of sophisticated cyber attacks, including ransomware, phishing, and state-sponsored hacking, underscores the need for advanced cybersecurity measures, continuous monitoring, and the development of resilient systems capable of withstanding or rapidly recovering from breaches, highlighting the ongoing arms race between cyber defenders and attackers. +31. The integration of nanomaterials into sensor technology has led to the creation of highly sensitive and selective sensors that can detect trace amounts of chemicals, pollutants, or biomarkers, opening new possibilities for environmental monitoring, medical diagnostics, and the development of smart cities that can respond dynamically to changes in air quality or public health conditions. +32. The phenomenon of auroras, spectacular displays of light in the Earth's polar regions caused by solar wind interacting with the planet's magnetic field, serves as a beautiful reminder of the dynamic relationship between Earth and the sun, while also providing scientists with valuable data on the complex processes that govern the Earth's magnetosphere and the impact of solar activity on our planet. +33. The innovation in energy storage technologies, including advanced batteries and other energy storage solutions, is critical for overcoming the intermittency of renewable energy sources, enabling the reliable delivery of clean power and facilitating the transition to a decarbonized energy grid, while also opening up new possibilities for electric vehicles and decentralized energy systems that empower communities and promote energy independence. +34. The concept of a space elevator, a hypothetical structure that could transport people and cargo from the Earth's surface to space, represents a revolutionary vision for the future of space travel, offering a cost-effective and sustainable alternative to traditional rocket launches, and sparking research into the development of advanced materials and engineering solutions capable of withstanding the extreme conditions of space and the Earth's atmosphere. +35. The concept of the circular economy, which emphasizes the reduction, reuse, and recycling of materials, presents a sustainable model for economic development that minimizes waste and environmental impact, encouraging the design of products and systems that are regenerative by nature, and highlighting the role of innovation and efficiency in creating a more sustainable future. +36. 
Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +37. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. +38. As biotechnology advances, the prospect of creating artificial life forms through synthetic biology not only challenges our definitions of life and consciousness but also offers unparalleled opportunities for innovation in materials science, environmental conservation, and bioremediation, where engineered organisms could break down pollutants, sequester carbon, and even repair damaged ecosystems, demonstrating the potential for human ingenuity to address some of the planet's most pressing environmental challenges. +39. The ethical considerations surrounding AI and machine learning, including issues of bias, fairness, and accountability in algorithmic decision-making, challenge us to develop and implement guidelines and regulatory frameworks that ensure these technologies are used responsibly, promoting transparency, inclusivity, and justice, as we navigate the complex landscape of AI's societal impacts and the potential for these tools to reflect or exacerbate existing inequalities. +40. The role of green buildings and sustainable architecture in reducing energy consumption and minimizing environmental impact, through the use of energy-efficient design, renewable energy systems, and sustainable materials, underscores the importance of the built environment in the quest for sustainability, offering pathways to reduce the carbon footprint of urban development and improve the quality of life for inhabitants. +41. Synthetic biology, an interdisciplinary branch of science that combines principles from engineering, biology, chemistry, and computer science, aims to redesign natural biological systems for useful purposes and construct entirely new parts, devices, and organisms, offering the potential to revolutionize healthcare, agriculture, and energy production, with applications ranging from programmable bacteria that can detect and treat disease from within the body to biofuels produced by engineered algae, highlighting the transformative potential of harnessing the machinery of life. +42. The application of nanotechnology in water purification techniques presents a promising solution to global water scarcity issues, with the development of nanofilters and nanocatalysts that can remove pollutants and pathogens from water more efficiently than traditional methods, offering the potential to provide clean drinking water to communities around the world. +43. 
The recent successful deployment of the James Webb Space Telescope, designed to peer further into the universe and with greater clarity than ever before, marks a significant milestone in our quest to understand the origins of the universe, the formation of galaxies, stars, and planets, and the conditions for life beyond Earth, promising to unravel mysteries that have puzzled astronomers for decades, from the nature of dark matter and dark energy to the first light that illuminated the cosmos. +44. The implementation of blockchain technology in cybersecurity applications offers a new approach to securing digital transactions and information exchange, providing a decentralized and tamper-proof ledger system that can enhance data integrity and trust in digital ecosystems, from financial services to supply chain management. +45. Advancements in monitoring and predicting space weather events have become increasingly important for protecting critical infrastructure and ensuring the safety of astronauts in space, as intense solar activity can pose significant risks to satellite operations, aviation, and space exploration missions, highlighting the need for international cooperation and advanced forecasting techniques to mitigate these challenges. +46. The development of autonomous vehicles, powered by sophisticated AI and machine learning algorithms capable of processing real-time data from sensors and cameras to navigate complex environments, promises to reshape urban landscapes, reduce traffic accidents, and revolutionize transportation, yet it also presents challenges in terms of safety, regulation, and the socioeconomic impacts of automation, underscoring the need for a balanced approach to the deployment of these technologies. +47. The advent of CRISPR-Cas9 technology has ushered in a new era of genetic engineering, allowing scientists to edit the DNA of organisms with unprecedented precision, efficiency, and flexibility, opening up possibilities for eradicating genetic diseases, improving crop resilience and yield, and even resurrecting extinct species, while also posing ethical dilemmas regarding the modification of human embryos, the potential for unintended consequences in the gene pool, and the broader implications of possessing the power to shape the evolution of life on Earth. +48. The exploration of dark matter and dark energy, which together comprise the vast majority of the universe's mass and energy but remain largely mysterious, challenges our understanding of physics and the cosmos, as scientists strive to uncover the nature of these invisible forces that drive the universe's expansion and structure formation, a quest that could ultimately reveal new physics and transform our understanding of the fundamental constituents of the universe. +49. Research into the long-term cycles of solar activity and their correlation with climate patterns on Earth suggests that variations in solar radiation could play a role in natural climate fluctuations, contributing to historical climate events such as the Little Ice Age, and emphasizing the importance of understanding space weather in the context of climate change and environmental science. +50. 
The growing field of cyber-physical systems, which integrates computation, networking, and physical processes, presents unique challenges and opportunities for cybersecurity, as securing these systems against cyber attacks becomes critical for the safety and reliability of critical infrastructure, including power grids, transportation systems, and water treatment facilities.
+
+END SENTENCES"""
+
+template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>""".format(prompt)
+
 os.environ["VLLM_USE_V1"] = "1"
 
 # Sample prompts.
 prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
+    template,
+    # "The president of the United States is",
+    # "The capital of France is",
+    # "The future of AI is",
 ]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=1)
 
 # Create an LLM.
 llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    # model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    model="/root/workspace/gnovack/models/llama-3.1-8b-instruct",
     max_num_seqs=8,
-    max_model_len=512,
+    max_model_len=4096,
+    max_num_batched_tokens=128,
     block_size=128,
     device="neuron",
     tensor_parallel_size=4,
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index f098c8e5fed1e..475228f2058fb 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -8,7 +8,7 @@
     "The future of AI is",
 ]
 # Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+sampling_params = SamplingParams(temperature=0.8, top_p=1)
 
 # Create an LLM.
 llm = LLM(
@@ -25,7 +25,10 @@
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.
     device="neuron",
-    tensor_parallel_size=2)
+    tensor_parallel_size=1,
+    disable_async_output_proc=True,
+    enable_chunked_prefill=True
+)
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
diff --git a/examples/offline_model_neuron.py b/examples/offline_model_neuron.py
new file mode 100644
index 0000000000000..6c5bcef342be1
--- /dev/null
+++ b/examples/offline_model_neuron.py
@@ -0,0 +1,171 @@
+import os
+import tempfile
+
+from vllm import LLM, SamplingParams
+from vllm.attention.backends.neuron_attn import NeuronAttentionBackend
+from vllm.config import VllmConfig
+from vllm.distributed.communication_op import tensor_model_parallel_all_gather
+from vllm.distributed.parallel_state import ensure_model_parallel_initialized, init_distributed_environment
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.layers.logits_processor import _prune_hidden_states
+from vllm.model_executor.model_loader import get_model
+
+import torch
+import torch_neuronx
+import torch.nn as nn
+import torch_xla.core.xla_model as xm
+import torch_xla.runtime as xr
+
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.neuron.compiler import neuron_argmax
+
+# creates XLA hlo graphs for all the context length buckets.
+os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+# creates XLA hlo graphs for all the token gen buckets.
+os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=1)
+
+# Create an LLM.
+config = EngineArgs(
+    model="/root/workspace/gnovack/models/llama-3.2-1b-instruct",
+    max_num_seqs=8,
+    # The max_model_len and block_size arguments are required to be same as
+    # max sequence length when targeting neuron device.
+    # Currently, this is a known limitation in continuous batching support
+    # in transformers-neuronx.
+    # TODO(liangfu): Support paged-attention in transformers-neuronx.
+    max_model_len=128,
+    block_size=128,
+    # The device can be automatically detected when AWS Neuron SDK is installed.
+    # The device argument can be either unspecified for automated detection,
+    # or explicitly assigned.
+    device="neuron",
+    tensor_parallel_size=1,
+    disable_async_output_proc=True
+)
+
+temp_file = tempfile.mkstemp()[1]
+
+init_distributed_environment(
+    world_size=1,
+    rank=0,
+    local_rank=0,
+    distributed_init_method=f"file://{temp_file}",
+    backend="gloo",
+)
+ensure_model_parallel_initialized(
+    1,
+    1,
+)
+
+attn_backend = NeuronAttentionBackend
+vllm_config = config.create_engine_config()
+device = xm.xla_device()
+model = get_model(vllm_config=vllm_config)
+model = model.eval().to(device)
+model.logits_processor.to(device)
+num_layers = len(model.model.layers)
+
+xm.wait_device_ops()
+
+def forward(
+    input_ids,
+    positions,
+    kv_caches,
+    attn_metadata,
+    intermediate_tensors,
+    inputs_embeds,
+    sampling_metadata
+):
+    # Run the model once and return the raw hidden states; the logits /
+    # greedy-sampling path below is kept for reference but disabled.
+    # hidden_states, (attn_input, q, k, v, attn_out, mlp_output, mlp_input) = model(
+    hidden_states = model(
+        input_ids,
+        positions,
+        kv_caches=kv_caches,
+        attn_metadata=attn_metadata,
+        intermediate_tensors=intermediate_tensors,
+        inputs_embeds=inputs_embeds
+    )
+
+    return hidden_states
+    # hidden_states = hidden_states.flatten(0, 1)
+    # logits = model.compute_logits(hidden_states, sampling_metadata)[-1, :100]
+    # argmax_token_ids = neuron_argmax(logits, dim=-1, keepdim=True)
+    # argmax_token_ids = argmax_token_ids.repeat(1, 1)
+    # return argmax_token_ids
+
+
+compiled_model = torch.compile(forward,
+    backend="openxla",
+    fullgraph=True,
+    dynamic=False
+)
+
+batch_size = 1
+seq_len = 128
+
+token_ids = torch.zeros((batch_size, seq_len), dtype=torch.int32)
+position_ids = torch.arange(0, 128, dtype=torch.int32).unsqueeze(0)
+slot_mapping = torch.zeros((batch_size, seq_len), dtype=torch.int64)
+input_lens = torch.ones((batch_size, ), dtype=torch.int32)
+
+attn_metadata = attn_backend.make_metadata(
+    num_prefills=batch_size,
+    num_prefill_tokens=batch_size * seq_len,
+    num_decode_tokens=0,
+    slot_mapping=slot_mapping,
+    multi_modal_placeholder_index_maps=None,
+    block_tables=None,
+    context_lens=None,
+    effective_query_lens=None,
+)
+
+cache_shape = attn_backend.get_kv_cache_shape(
+    num_blocks=10_000,
+    block_size=32,
+    num_kv_heads=model.config.num_key_value_heads,
+    head_size=model.config.head_dim
+)
+
+# Calculate the positions to sample from.
+start_indices = torch.arange(batch_size, dtype=torch.int32) * seq_len
+logits_indices = start_indices + input_lens - 1
+
+sampling_metadata = SamplingMetadata(
+    seq_groups=[],
+    selected_token_indices=logits_indices.to(device),
+    categorized_sample_indices={},
+    num_prompts=attn_metadata.num_prefills,
+)
+kv_caches = [torch.zeros(cache_shape) for _ in range(num_layers)]
+
+output = compiled_model(
+    token_ids.to(device),
+    position_ids.to(device),
+    kv_caches=[x.to(device) for x in kv_caches],
+    attn_metadata=attn_metadata,
+    intermediate_tensors=None,
+    inputs_embeds=None,
+    sampling_metadata=sampling_metadata
+)
+print(output)
+# print("Q:", q, q.shape)
+# # print("W_Q:", w_q, w_q.shape)
+# print("Attn input:", attn_input, attn_input.shape)
+# print("K:", k, k.shape)
+# print("attn_out:", attn_out, attn_out.shape)
+# print("mlp_input:", mlp_input, mlp_input.shape)
+# print("mlp_output:", mlp_output, mlp_output.shape)
\ No newline at end of file
diff --git a/notebooks/llama.ipynb b/notebooks/llama.ipynb
new file mode 100644
index 0000000000000..9cf26d5919660
--- /dev/null
+++ b/notebooks/llama.ipynb
@@ -0,0 +1,425 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "from transformers.models.llama.modeling_llama import apply_rotary_pos_emb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:MASTER_ADDR environment variable is not set, defaulting to localhost\n",
+      "WARNING:root:Found libneuronpjrt.so. 
Setting PJRT_DEVICE=NEURON.\n" + ] + } + ], + "source": [ + "model = AutoModelForCausalLM.from_pretrained(\"TinyLlama/TinyLlama-1.1B-Chat-v1.0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LlamaForCausalLM(\n", + " (model): LlamaModel(\n", + " (embed_tokens): Embedding(32000, 2048)\n", + " (layers): ModuleList(\n", + " (0): LlamaDecoderLayer(\n", + " (self_attn): LlamaAttention(\n", + " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " )\n", + " )\n", + " (norm): LlamaRMSNorm((2048,), eps=1e-05)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", + ")\n" + ] + } + ], + "source": [ + "model.model.layers = model.model.layers[:1]\n", + "model = model.to(torch.bfloat16)\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = torch.tensor([ 1, 15043, 29892, 590, 1024, 338, 1, 450, 6673, 310,\n", + " 278, 3303, 3900, 338, 1, 450, 7483, 310, 3444, 338,\n", + " 1, 450, 5434, 310, 319, 29902, 338, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0],\n", + " dtype=torch.int32).unsqueeze(0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "outputs = model(input_ids, output_hidden_states=True, output_attentions=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-0.1494, -0.8125, 1.8359, ..., -0.5195, -1.1484, -1.3516],\n", + " [-1.3359, 0.8125, -0.5938, ..., 1.5391, 1.7188, 0.9023],\n", + " [-0.9570, 0.4316, -0.4121, ..., 0.0747, 0.4453, -0.0378],\n", + " [ 0.9922, -1.5703, 1.7422, ..., 0.3613, 0.2334, 1.2266],\n", + " [-0.0067, 1.4609, 0.8281, ..., -1.0234, 0.9375, 0.7969],\n", + " [-1.1484, 1.3516, -0.0215, ..., -0.5664, -0.6055, 3.0312]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outputs.hidden_states[-1][0, :6, :]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "attn_scores = logits.attentions[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0031, 0.0178, 
0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " ...,\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334],\n", + " [ 0.0031, 0.0178, 0.0210, ..., -0.0052, -0.0420, -0.0334]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeds = model.model.embed_tokens(input_ids)\n", + "embeds" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " ...,\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812],\n", + " [ 0.0271, 0.1826, 0.3027, ..., -0.0635, -0.4902, -0.2812]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# input_shape = embeds.shape[:-1]\n", + "# hidden_shape = (*input_shape, -1, 64)\n", + "# k = model.model.layers[0].self_attn.k_proj(embeds)#.view(hidden_shape).transpose(1, 2)\n", + "\n", + "norm_embeds = model.model.layers[0].input_layernorm(embeds)\n", + "norm_embeds\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " ...,\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199],\n", + " [-0.0148, 0.0123, 0.0157, ..., 0.0266, 0.0075, -0.0199]]],\n", + " dtype=torch.bfloat16, grad_fn=)\n", + "tensor([[[-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " ...,\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707],\n", + " [-0.1001, 0.2559, 0.2871, ..., 0.1953, -0.2969, -0.4707]]],\n", + " dtype=torch.bfloat16, grad_fn=)\n", + "tensor([[[ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.9111e-03, -1.7090e-02, -2.4902e-02, ..., -8.9407e-06,\n", + " -2.0142e-02, 4.2419e-03],\n", + " ...,\n", + " [ 8.9722e-03, -1.7090e-02, -2.4780e-02, ..., 1.4782e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03],\n", + " [ 8.8501e-03, -1.6968e-02, -2.4902e-02, ..., 3.2902e-05,\n", + " -2.0142e-02, 4.2419e-03]]], dtype=torch.bfloat16,\n", + " grad_fn=)\n" + ] + } + ], + "source": [ + "input_shape = embeds.shape[:-1]\n", + "hidden_shape = (*input_shape, -1, 64)\n", + "\n", + "q = model.model.layers[0].self_attn.q_proj(norm_embeds)\n", + "k = model.model.layers[0].self_attn.k_proj(norm_embeds)\n", + "v 
= model.model.layers[0].self_attn.v_proj(norm_embeds)\n", + "\n", + "position_embeds = model.model.rotary_emb(embeds, torch.arange(0,128).unsqueeze(0))\n", + "attn_out = model.model.layers[0].self_attn(norm_embeds, position_embeddings=position_embeds)\n", + "print(attn_out[0])\n", + "attn_out = attn_out[0] + embeds\n", + "# print(attn_out)\n", + "attn_out_norm = model.model.layers[0].post_attention_layernorm(attn_out)\n", + "print(attn_out_norm)\n", + "mlp_out = model.model.layers[0].mlp(attn_out_norm)\n", + "print(mlp_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " ...,\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]],\n", + "\n", + " [[-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " ...,\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275],\n", + " [-0.0038, 0.0505, 0.0369, ..., 0.0034, -0.0287, 0.0275]]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.matmul(attn_scores, v)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 
0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " ...,\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844],\n", + " [ 0.8398, 2.2812, 2.7969, ..., 5.7500, 0.9062, -3.4844]]],\n", + " dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.einsum(\n", + " 'bsh,hq->bsq',\n", + " norm_embeds,\n", + " model.model.layers[0].self_attn.q_proj.weight.t()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[66], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m cos, sin \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mrotary_emb(embeds, torch\u001b[38;5;241m.\u001b[39marange(\u001b[38;5;241m0\u001b[39m,\u001b[38;5;241m128\u001b[39m)\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m \u001b[43mapply_rotary_pos_emb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msin\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m128\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", + "File \u001b[0;32m~/workspace/gnovack/vllm/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:225\u001b[0m, in \u001b[0;36mapply_rotary_pos_emb\u001b[0;34m(q, k, cos, sin, position_ids, unsqueeze_dim)\u001b[0m\n\u001b[1;32m 223\u001b[0m cos \u001b[38;5;241m=\u001b[39m cos\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[1;32m 224\u001b[0m sin \u001b[38;5;241m=\u001b[39m sin\u001b[38;5;241m.\u001b[39munsqueeze(unsqueeze_dim)\n\u001b[0;32m--> 225\u001b[0m q_embed \u001b[38;5;241m=\u001b[39m (\u001b[43mq\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mcos\u001b[49m) \u001b[38;5;241m+\u001b[39m (rotate_half(q) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 226\u001b[0m k_embed \u001b[38;5;241m=\u001b[39m (k \u001b[38;5;241m*\u001b[39m cos) \u001b[38;5;241m+\u001b[39m (rotate_half(k) \u001b[38;5;241m*\u001b[39m sin)\n\u001b[1;32m 227\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m q_embed, k_embed\n", + "\u001b[0;31mRuntimeError\u001b[0m: The size of tensor a (2048) must match the size of tensor b (64) at non-singleton dimension 3" + ] + } + ], + "source": [ + "\n", + "apply_rotary_pos_emb(q, k, cos, sin)[0].transpose(1,2).reshape(1, 128, -1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/run-compile-script.sh b/run-compile-script.sh new file mode 100755 index 0000000000000..12d307934f4cf --- /dev/null +++ b/run-compile-script.sh @@ -0,0 +1,12 @@ +# rm -rf /var/tmp/neuron-compile-cache/* + +# export TORCHDYNAMO_VERBOSE=1 +export PYTHONPATH=/root/workspace/gnovack/vllm +# export TORCH_LOGS=+dynamo,graph +export NEURON_RT_NUM_CORES=16 +# export XLA_DISABLE_FUNCTIONALIZATION=0 +export NEURON_CC_FLAGS="-O1 --verbose=debug --logical-nc-config=1 --logfile=neuron-compiler.log --internal-compiler-debug-mode=all --compile_workdir=/root/workspace/gnovack/vllm/compiler-workdir" +# export NEURON_CC_FLAGS="-O1" + +python examples/offline_model_neuron.py > compile-script-output 2>&1 +# python examples/offline_inference_neuron.py > inference-script-output 2>&1 diff --git a/serve.sh b/serve.sh new file mode 100755 index 0000000000000..67118d6affd56 --- /dev/null +++ b/serve.sh @@ -0,0 +1,11 @@ +export NEURON_CC_FLAGS="--verbose=debug --logfile=neuron-compiler.log --internal-compiler-debug-mode=penguin --compile_workdir=/root/workspace/gnovack/vllm/compiler-workdir --logical-nc-config=2 -O1" +VLLM_USE_V1=1 PYTHONPATH=/root/workspace/gnovack/vllm python vllm/entrypoints/openai/api_server.py \ + --model /root/workspace/gnovack/models/llama-3.1-8b-instruct \ + --max-num-seqs 8 \ + --max-model-len 4096 \ + --max-num-batched-tokens 128 \ + --enable-chunked-prefill \ + --block-size 128 \ + --device neuron \ + -tp 4 \ + --worker-cls="vllm.v1.worker.neuron_worker.NeuronWorker" \ No newline at end of file diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 1616cd03274d6..761505642de2b 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -234,20 +234,7 @@ def _flash_attention_core( buffer=nl.psum) - # for k_i in nl.sequential_range(LARGE_TILE_SZ // B_P_SIZE): for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): - # print(f"K_I: {k_i.e}") - - # assert p_local_transposed.shape == (128, LARGE_TILE_SZ), f"P Local Shape: {p_local_transposed.shape}; k_i: {k_i.e}, {k_i.tile}" - # print(f"P Local Shape: {p_local_transposed.shape}; k_i: {k_i.e}, {k_i.tile}") - - # p_local_slice = p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)] - # assert p_local_slice.shape == (128, 128), f"P Local Slice Shape: {p_local_slice.shape}; P local type: {p_local_slice.dtype}" - # print(f"P Local Slice Shape: {p_local_slice.shape}; P local type: {p_local_slice.dtype}") - # assert v[k_i, :, :].shape == (128, 128), f"V Shape: {v.shape}; V type: {v.dtype}" - # print(f"V Shape: {v.shape}; V type: {v.dtype}") - # assert pv_psum.shape == (128, 128), f"PV Sum Shape: {v.shape}" - # print(f"PV Sum Shape: {pv_psum.shape}") pv_psum[:, :] += nl.matmul( p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], @@ -367,9 +354,9 @@ def flash_paged_attention( """ config = config or FlashConfig() B_F_SIZE = 512 - # B_P_SIZE = 128 - B_P_SIZE = 128 b, h, d, seqlen_q = query.shape + B_P_SIZE = 128 + # B_P_SIZE = min(seqlen_q, 128) B_D_SIZE = d LARGE_TILE_SZ = config.seq_tile_size diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py index 8b1d085fb4c3e..54f659e29b07f 100644 --- 
+++ b/vllm/distributed/device_communicators/neuron_communicator.py
@@ -1,11 +1,7 @@
-import os
 import torch
-import torch.distributed as dist
 from torch.distributed import ProcessGroup
-
 from vllm.platforms import current_platform
-import neuronx_distributed
 
 if current_platform.is_neuron():
     import torch_xla.core.xla_model as xm
diff --git a/vllm/v1/worker/neuron_model_runner.py b/vllm/v1/worker/neuron_model_runner.py
index ee6e3f7eea696..6e4e88fd8e41b 100644
--- a/vllm/v1/worker/neuron_model_runner.py
+++ b/vllm/v1/worker/neuron_model_runner.py
@@ -331,13 +331,13 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         num_active_blocks_shifted = shift_bit_length(
             ((context_lens+ self.block_size - 1) // self.block_size).sum().item()
         )
-        num_active_blocks_factor = (LARGE_TILE_SZ // self.block_size // num_active_blocks_shifted)
+        num_active_blocks_factor = max(LARGE_TILE_SZ // self.block_size // num_active_blocks_shifted, 1)
         num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor
-        assert (num_active_blocks * self.block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}"
+        # assert (num_active_blocks * self.block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}"
         context_kv_len = num_active_blocks * self.block_size
-        assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}"
+        # assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}"
 
         block_table = self.input_batch.block_table[:num_reqs]
@@ -361,9 +361,10 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                 prior_mask,
                 (
                     0,
-                    LARGE_TILE_SZ - prior_mask.shape[1],
+                    max(context_kv_len, LARGE_TILE_SZ) - prior_mask.shape[1],
                     0,
-                    B_P_SIZE - prior_mask.shape[0],
+                    # B_P_SIZE - prior_mask.shape[0],
+                    padded_num_tokens - prior_mask.shape[0],
                 ),
                 "constant",
                 0,
@@ -374,7 +375,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
                     0,
                     padded_num_tokens - active_mask.shape[1],
                     0,
-                    B_P_SIZE - active_mask.shape[0],
+                    # B_P_SIZE - active_mask.shape[0],
+                    padded_num_tokens - active_mask.shape[0],
                 ),
                 "constant",
                 0,
diff --git a/vllm/v1/worker/neuron_worker.py b/vllm/v1/worker/neuron_worker.py
index f4168032aa559..5d6135e1020e0 100644
--- a/vllm/v1/worker/neuron_worker.py
+++ b/vllm/v1/worker/neuron_worker.py
@@ -56,7 +56,7 @@ def initialize(self):
 
     def compile_or_warm_up_model(self):
         # TODO: Implement AOT compilation logic here...
-        self.model_runner.capture_model()
+        # self.model_runner.capture_model()
         ...
 
     def initialize_cache(self, num_device_blocks: int) -> None:
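
For reviewers: the mask-padding change in `_prepare_inputs` above can be exercised in isolation. The snippet below is an illustrative sketch, not part of the patch; the shapes are made up, and `LARGE_TILE_SZ`, `block_size`, and `padded_num_tokens` stand in for the values the runner actually computes. It only mirrors the padding arithmetic shown in the hunks (pad the prior mask to `max(context_kv_len, LARGE_TILE_SZ)` columns, and pad both masks to `padded_num_tokens` rows instead of the fixed `B_P_SIZE`).

```python
import torch
import torch.nn.functional as F

LARGE_TILE_SZ = 2048       # flash-attention context tile size (config.seq_tile_size)
block_size = 128           # KV-cache block size
padded_num_tokens = 128    # padded query length for this chunked-prefill step
num_query_tokens = 7       # example: actual scheduled tokens this step

# Example: 20 context blocks already filled, so the prior-KV width can exceed
# LARGE_TILE_SZ; the new code pads to max(context_kv_len, LARGE_TILE_SZ).
context_kv_len = 20 * block_size

prior_mask = torch.ones((num_query_tokens, context_kv_len), dtype=torch.bool)
active_mask = torch.ones((num_query_tokens, num_query_tokens), dtype=torch.bool)

# Rows are padded to padded_num_tokens rather than the former hard-coded B_P_SIZE.
prior_mask = F.pad(
    prior_mask,
    (0, max(context_kv_len, LARGE_TILE_SZ) - prior_mask.shape[1],
     0, padded_num_tokens - prior_mask.shape[0]),
    "constant", 0)
active_mask = F.pad(
    active_mask,
    (0, padded_num_tokens - active_mask.shape[1],
     0, padded_num_tokens - active_mask.shape[0]),
    "constant", 0)

print(prior_mask.shape)   # torch.Size([128, 2560])
print(active_mask.shape)  # torch.Size([128, 128])
```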