diff --git a/evals/metrics/__init__.py b/evals/metrics/__init__.py
new file mode 100644
index 00000000..2d5825c6
--- /dev/null
+++ b/evals/metrics/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evals/metrics/ragas/__init__.py b/evals/metrics/ragas/__init__.py
new file mode 100644
index 00000000..f2f02578
--- /dev/null
+++ b/evals/metrics/ragas/__init__.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#
+
+from .ragas import (
+    RAGASContextualPrecisionMetric,
+    RAGASContextualRelevancyMetric,
+    RAGASAnswerRelevancyMetric,
+    RAGASFaithfulnessMetric,
+    RAGASContextualRecallMetric,
+    RagasMetric,
+)
diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
new file mode 100644
index 00000000..d437c5cf
--- /dev/null
+++ b/evals/metrics/ragas/ragas.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#
+
+from typing import Dict, Optional, Union
+
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models import BaseLanguageModel
+from langchain_huggingface import HuggingFaceEndpoint
+
+
+def format_ragas_metric_name(name: str):
+    return f"{name} (ragas)"
+
+
+class RAGASContextualPrecisionMetric:
+    """This metric checks the contextual precision using Ragas."""
+
+    def __init__(
+        self,
+        threshold: float = 0.3,
+        model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo",
+    ):
+        self.threshold = threshold
+        self.model = model
+
+    def measure(self, test_case: Dict):
+        try:
+            from ragas import evaluate
+            from ragas.llms import LangchainLLMWrapper
+            from ragas.metrics import context_precision
+
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.")
`pip install ragas`.") + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Set LLM model + if isinstance(self.model, str): + chat_model = HuggingFaceEndpoint( + endpoint_url=self.model, + timeout=600, + ) + else: + chat_model = self.model + chat_model = LangchainLLMWrapper(chat_model) + # Create a dataset from the test case + data = { + "contexts": [test_case["retrieval_context"]], + "question": [test_case["input"]], + "ground_truth": [test_case["expected_output"]], + } + dataset = Dataset.from_dict(data) + + # Evaluate the dataset using Ragas + scores = evaluate(dataset, metrics=[context_precision], llm=chat_model) + # Ragas only does dataset-level comparisons + context_precision_score = scores["context_precision"] + self.success = context_precision_score >= self.threshold + self.score = context_precision_score + return self.score + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def is_successful(self): + return self.success + + @property + def __name__(self): + return format_ragas_metric_name("Contextual Precision") + + +class RAGASContextualRelevancyMetric: + """This metric checks the contextual relevancy using Ragas.""" + + def __init__( + self, + threshold: float = 0.3, + model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo", + ): + self.threshold = threshold + self.model = model + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + # sends to server + try: + from ragas import evaluate + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import context_relevancy + + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Set LLM model + if isinstance(self.model, str): + chat_model = HuggingFaceEndpoint( + endpoint_url=self.model, + timeout=600, + ) + else: + chat_model = self.model + chat_model = LangchainLLMWrapper(chat_model) + # Create a dataset from the test case + data = { + "contexts": [test_case["retrieval_context"]], + "question": [test_case["input"]], + } + dataset = Dataset.from_dict(data) + + # Evaluate the dataset using Ragas + scores = evaluate(dataset, metrics=[context_relevancy], llm=chat_model) + + # Ragas only does dataset-level comparisons + context_relevancy_score = scores["context_relevancy"] + self.success = context_relevancy_score >= self.threshold + self.score = context_relevancy_score + return self.score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return format_ragas_metric_name("Contextual Relevancy") + + +class RAGASAnswerRelevancyMetric: + """This metric checks the answer relevancy using Ragas.""" + + def __init__( + self, + threshold: float = 0.3, + model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo", + embeddings: Optional[Embeddings] = None, + ): + + self.threshold = threshold + self.model = model + self.embeddings = embeddings + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + # sends to server + try: + from ragas import evaluate + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import answer_relevancy + + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Set LLM model + if isinstance(self.model, str): + chat_model = HuggingFaceEndpoint( + endpoint_url=self.model, + timeout=600, + ) + else: + chat_model = self.model + chat_model = LangchainLLMWrapper(chat_model) + data = { + "question": [test_case["input"]], + "answer": [test_case["actual_output"]], + "contexts": [test_case["retrieval_context"]], + } + dataset = Dataset.from_dict(data) + + scores = evaluate( + dataset, + metrics=[answer_relevancy], + llm=chat_model, + embeddings=self.embeddings, + ) + answer_relevancy_score = scores["answer_relevancy"] + self.success = answer_relevancy_score >= self.threshold + self.score = answer_relevancy_score + return self.score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return format_ragas_metric_name("Answer Relevancy") + + +class RAGASFaithfulnessMetric: + def __init__( + self, + threshold: float = 0.3, + model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo", + ): + + self.threshold = threshold + self.model = model + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + # sends to server + try: + from ragas import evaluate + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import faithfulness + + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Set LLM model + if isinstance(self.model, str): + chat_model = HuggingFaceEndpoint( + endpoint_url=self.model, + timeout=600, + ) + else: + chat_model = self.model + chat_model = LangchainLLMWrapper(chat_model) + data = { + "contexts": [test_case["retrieval_context"]], + "question": [test_case["input"]], + "answer": [test_case["actual_output"]], + } + dataset = Dataset.from_dict(data) + + scores = evaluate( + dataset, + metrics=[faithfulness], + llm=chat_model, + ) + faithfulness_score = scores["faithfulness"] + self.success = faithfulness_score >= self.threshold + self.score = faithfulness_score + return self.score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return format_ragas_metric_name("Faithfulness") + + +class RAGASContextualRecallMetric: + """This metric checks the context recall using Ragas.""" + + def __init__( + self, + threshold: float = 0.3, + model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo", + ): + self.threshold = threshold + self.model = model + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + # sends to server + try: + from ragas import evaluate + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import context_recall + + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Set LLM model + if isinstance(self.model, str): + chat_model = HuggingFaceEndpoint( + endpoint_url=self.model, + timeout=600, + ) + else: + chat_model = self.model + chat_model = LangchainLLMWrapper(chat_model) + data = { + "question": [test_case["input"]], + "ground_truth": [test_case["expected_output"]], + "contexts": [test_case["retrieval_context"]], + } + dataset = Dataset.from_dict(data) + + scores = evaluate( + dataset, + [context_recall], + llm=chat_model, + ) + context_recall_score = scores["context_recall"] + self.success = context_recall_score >= self.threshold + self.score = context_recall_score + return self.score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return format_ragas_metric_name("Contextual Recall") + + +class RagasMetric: + """This metric checks if the output is more than 3 letters.""" + + def __init__( + self, + threshold: float = 0.3, + model: Optional[Union[str, BaseLanguageModel]] = "gpt-3.5-turbo", + embeddings: Optional[Embeddings] = None, + ): + + self.threshold = threshold + self.model = model + self.embeddings = embeddings + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") + + try: + # How do i make sure this isn't just huggingface dataset + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + # Convert the Dict to a format compatible with Dataset + score_breakdown = {} + metrics = [ + RAGASContextualPrecisionMetric(model=self.model), + RAGASContextualRecallMetric(model=self.model), + RAGASFaithfulnessMetric(model=self.model), + RAGASAnswerRelevancyMetric(model=self.model, embeddings=self.embeddings), + ] + + for metric in metrics: + score = metric.measure(test_case) + score_breakdown[metric.__name__] = score + + ragas_score = sum(score_breakdown.values()) / len(score_breakdown) + + self.success = ragas_score >= self.threshold + self.score = ragas_score + self.score_breakdown = score_breakdown + return self.score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "RAGAS" diff --git a/requirements.txt b/requirements.txt index cc3859dd..483cf00a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e +langchain_community +langchain_huggingface lm-eval==0.4.2 +ragas diff --git a/tests/requirements.txt b/tests/requirements.txt index cc3859dd..483cf00a 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,2 +1,5 @@ bigcode-eval@git+https://github.com/bigcode-project/bigcode-evaluation-harness.git@a1b4a7949a24c8e3ef0d05a01097b2d14ffba56e +langchain_community +langchain_huggingface lm-eval==0.4.2 +ragas diff --git a/tests/test_ragas.py b/tests/test_ragas.py new file mode 100644 index 00000000..66d30964 --- /dev/null +++ b/tests/test_ragas.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# +import unittest + +from 
+
+
+class TestRagasMetric(unittest.TestCase):
+
+    @unittest.skip("requires a running LLM serving endpoint on localhost")
+    def test_ragas(self):
+        # Replace this with the actual output from your LLM application
+        actual_output = "We offer a 30-day full refund at no extra cost."
+
+        # Replace this with the expected output from your RAG generator
+        expected_output = "You are eligible for a 30 day full refund at no extra cost."
+
+        # Replace this with the actual retrieved context from your RAG pipeline
+        retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]
+        from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+
+        embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+        metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings)
+        test_case = {
+            "input": "What if these shoes don't fit?",
+            "actual_output": actual_output,
+            "expected_output": expected_output,
+            "retrieval_context": retrieval_context,
+        }
+
+        metric.measure(test_case)
+        print(metric.score)
+
+
+if __name__ == "__main__":
+    unittest.main()
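
For reviewers, a minimal usage sketch of the new metrics outside of unittest. The TGI endpoint URL and the BGE embedding model are illustrative values taken from tests/test_ragas.py, not requirements of the API; a live endpoint must be running for the calls to succeed.

    # Minimal usage sketch (not part of the diff); endpoint URL and embedding model are illustrative.
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings

    from evals.metrics.ragas import RAGASFaithfulnessMetric, RagasMetric

    test_case = {
        "input": "What if these shoes don't fit?",
        "actual_output": "We offer a 30-day full refund at no extra cost.",
        "expected_output": "You are eligible for a 30 day full refund at no extra cost.",
        "retrieval_context": ["All customers are eligible for a 30 day full refund at no extra cost."],
    }

    # A standalone metric only needs an LLM endpoint (assumed local TGI service on port 8008).
    faithfulness = RAGASFaithfulnessMetric(threshold=0.3, model="http://localhost:8008")
    print(faithfulness.measure(test_case), faithfulness.is_successful())

    # The aggregate RagasMetric additionally takes embeddings, used by answer relevancy.
    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    ragas = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings)
    print(ragas.measure(test_case), ragas.score_breakdown)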