diff --git a/pyproject.toml b/pyproject.toml index 0ff78c93e..630f45531 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"] [project] name = "uptrain" -version = "0.6.6" +version = "0.6.6.post1" description = "UpTrain - tool to evaluate LLM applications on aspects like factual accuracy, response quality, retrieval quality, tonality, etc." readme = "README.md" maintainers = [{ name = "UpTrain AI Team", email = "oss@uptrain.ai" }] @@ -20,7 +20,8 @@ classifiers = [ ] keywords = ["uptrain", "ai", "LLM", "evaluation", "hallucinations", "observability", "response quality"] dependencies = [ - "pydantic<1.10.10", + "pydantic", + "pydantic-settings", "loguru", "lazy_loader", "networkx", diff --git a/tests/test_builtins.py b/tests/test_builtins.py index 7ccc1b865..4c52f25fa 100644 --- a/tests/test_builtins.py +++ b/tests/test_builtins.py @@ -8,6 +8,7 @@ """ import polars as pl +import os from uptrain.framework import Settings from uptrain.framework.builtins import ( @@ -32,8 +33,11 @@ CheckSubQueryCompleteness, ) -# settings = Settings(openai_api_key="sk-************************") -settings = Settings() +# Enter your OpenAI API key here if it is not already set as an environment variable +openai_api_key = os.environ.get("OPENAI_API_KEY") + +settings = Settings(openai_api_key=openai_api_key) + dataset = pl.DataFrame( { "response": [ @@ -70,45 +74,110 @@ def test_check_response_completeness(): check = CheckResponseCompleteness() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_completeness" in output.columns and "explanation_response_completeness" in output.columns - assert output["score_response_completeness"].dtype == pl.Float64 and len(output["score_response_completeness"]) - output["score_response_completeness"].null_count() > 0 - assert output["explanation_response_completeness"].dtype == pl.Utf8 and len(output["explanation_response_completeness"]) - output["explanation_response_completeness"].null_count() > 0 + assert ( + "score_response_completeness" in output.columns + and "explanation_response_completeness" in output.columns + ) + assert ( + output["score_response_completeness"].dtype == pl.Float64 + and len(output["score_response_completeness"]) + - output["score_response_completeness"].null_count() + > 0 + ) + assert ( + output["explanation_response_completeness"].dtype == pl.Utf8 + and len(output["explanation_response_completeness"]) + - output["explanation_response_completeness"].null_count() + > 0 + ) def test_check_response_conciseness(): check = CheckResponseConciseness() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_conciseness" in output.columns and "explanation_response_conciseness" in output.columns - assert output["score_response_conciseness"].dtype == pl.Float64 and len(output["score_response_conciseness"]) - output["score_response_conciseness"].null_count() > 0 - assert output["explanation_response_conciseness"].dtype == pl.Utf8 and len(output["explanation_response_conciseness"]) - output["explanation_response_conciseness"].null_count() > 0 + assert ( + "score_response_conciseness" in output.columns + and "explanation_response_conciseness" in output.columns + ) + assert ( + output["score_response_conciseness"].dtype == pl.Float64 + and len(output["score_response_conciseness"]) + - output["score_response_conciseness"].null_count() + > 0 + ) + assert ( + output["explanation_response_conciseness"].dtype == pl.Utf8 
+ and len(output["explanation_response_conciseness"]) + - output["explanation_response_conciseness"].null_count() + > 0 + ) def test_check_response_relevance(): check = CheckResponseRelevance() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_relevance" in output.columns and "explanation_response_relevance" in output.columns - assert output["score_response_relevance"].dtype == pl.Float64 and len(output["score_response_relevance"]) - output["score_response_relevance"].null_count() > 0 - assert output["explanation_response_relevance"].dtype == pl.Utf8 and len(output["explanation_response_relevance"]) - output["explanation_response_relevance"].null_count() > 0 + assert ( + "score_response_relevance" in output.columns + and "explanation_response_relevance" in output.columns + ) + assert ( + output["score_response_relevance"].dtype == pl.Float64 + and len(output["score_response_relevance"]) + - output["score_response_relevance"].null_count() + > 0 + ) + assert ( + output["explanation_response_relevance"].dtype == pl.Utf8 + and len(output["explanation_response_relevance"]) + - output["explanation_response_relevance"].null_count() + > 0 + ) def test_check_valid_response(): check = CheckValidResponse() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_valid_response" in output.columns and "explanation_valid_response" in output.columns - assert output["score_valid_response"].dtype == pl.Float64 and len(output["score_valid_response"]) - output["score_valid_response"].null_count() > 0 - assert output["explanation_valid_response"].dtype == pl.Utf8 and len(output["explanation_valid_response"]) - output["explanation_valid_response"].null_count() > 0 + assert ( + "score_valid_response" in output.columns + and "explanation_valid_response" in output.columns + ) + assert ( + output["score_valid_response"].dtype == pl.Float64 + and len(output["score_valid_response"]) + - output["score_valid_response"].null_count() + > 0 + ) + assert ( + output["explanation_valid_response"].dtype == pl.Utf8 + and len(output["explanation_valid_response"]) + - output["explanation_valid_response"].null_count() + > 0 + ) def test_check_response_consistency(): check = CheckResponseConsistency() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_consistency" in output.columns and "explanation_response_consistency" in output.columns - assert output["score_response_consistency"].dtype == pl.Float64 and len(output["score_response_consistency"]) - output["score_response_consistency"].null_count() > 0 - assert output["explanation_response_consistency"].dtype == pl.Utf8 and len(output["explanation_response_consistency"]) - output["explanation_response_consistency"].null_count() > 0 + assert ( + "score_response_consistency" in output.columns + and "explanation_response_consistency" in output.columns + ) + assert ( + output["score_response_consistency"].dtype == pl.Float64 + and len(output["score_response_consistency"]) + - output["score_response_consistency"].null_count() + > 0 + ) + assert ( + output["explanation_response_consistency"].dtype == pl.Utf8 + and len(output["explanation_response_consistency"]) + - output["explanation_response_consistency"].null_count() + > 0 + ) response_matching_dataset = pl.DataFrame( @@ -125,13 +194,27 @@ def test_check_response_consistency(): } ) + def test_check_response_matching(): check = CheckResponseMatching() output = 
check.setup(settings).run(response_matching_dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_matching" in output.columns and "explanation_response_matching" in output.columns - assert output["score_response_matching"].dtype == pl.Float64 and len(output["score_response_matching"]) - output["score_response_matching"].null_count() > 0 - assert output["explanation_response_matching"].dtype == pl.Utf8 and len(output["explanation_response_matching"]) - output["explanation_response_matching"].null_count() > 0 + assert ( + "score_response_matching" in output.columns + and "explanation_response_matching" in output.columns + ) + assert ( + output["score_response_matching"].dtype == pl.Float64 + and len(output["score_response_matching"]) + - output["score_response_matching"].null_count() + > 0 + ) + assert ( + output["explanation_response_matching"].dtype == pl.Utf8 + and len(output["explanation_response_matching"]) + - output["explanation_response_matching"].null_count() + > 0 + ) # ----------------------------------------------------------- @@ -143,15 +226,27 @@ def test_check_context_relevance(): check = CheckContextRelevance() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_context_relevance" in output.columns and "explanation_context_relevance" in output.columns - assert output["score_context_relevance"].dtype == pl.Float64 and len(output["score_context_relevance"]) - output["score_context_relevance"].null_count() > 0 - assert output["explanation_context_relevance"].dtype == pl.Utf8 and len(output["explanation_context_relevance"]) - output["explanation_context_relevance"].null_count() > 0 + assert ( + "score_context_relevance" in output.columns + and "explanation_context_relevance" in output.columns + ) + assert ( + output["score_context_relevance"].dtype == pl.Float64 + and len(output["score_context_relevance"]) + - output["score_context_relevance"].null_count() + > 0 + ) + assert ( + output["explanation_context_relevance"].dtype == pl.Utf8 + and len(output["explanation_context_relevance"]) + - output["explanation_context_relevance"].null_count() + > 0 + ) + context_reranking_dataset = pl.DataFrame( { - "question": [ - "What are the main causes of climate change?" - ], + "question": ["What are the main causes of climate change?"], "context": [ """ 1. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes. @@ -169,7 +264,7 @@ def test_check_context_relevance(): 4. Other factors that contribute to climate change include methane emissions from livestock and rice paddies, as well as nitrous oxide emissions from agricultural fertilizers. 5. Changes in land use, such as urbanization and deforestation, also play a role in altering local climates and contributing to global climate change. 
""", - ] + ], } ) @@ -178,16 +273,27 @@ def test_check_context_reranking(): check = CheckContextReranking() output = check.setup(settings).run(context_reranking_dataset) assert isinstance(output, pl.DataFrame) - assert "score_context_reranking" in output.columns and "explanation_context_reranking" in output.columns - assert output["score_context_reranking"].dtype == pl.Float64 and len(output["score_context_reranking"]) - output["score_context_reranking"].null_count() > 0 - assert output["explanation_context_reranking"].dtype == pl.Utf8 and len(output["explanation_context_reranking"]) - output["explanation_context_reranking"].null_count() > 0 + assert ( + "score_context_reranking" in output.columns + and "explanation_context_reranking" in output.columns + ) + assert ( + output["score_context_reranking"].dtype == pl.Float64 + and len(output["score_context_reranking"]) + - output["score_context_reranking"].null_count() + > 0 + ) + assert ( + output["explanation_context_reranking"].dtype == pl.Utf8 + and len(output["explanation_context_reranking"]) + - output["explanation_context_reranking"].null_count() + > 0 + ) context_conciseness_dataset = pl.DataFrame( { - "question": [ - "What are the main causes of climate change?" - ], + "question": ["What are the main causes of climate change?"], "context": [ """ 1. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes. @@ -202,62 +308,130 @@ def test_check_context_reranking(): 1. Climate change is primarily driven by human-induced factors, including the release of carbon dioxide and other greenhouse gases into the atmosphere. 2. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes. 
""", - ] + ], } ) + def test_check_context_conciseness(): check = CheckContextConciseness() output = check.setup(settings).run(context_conciseness_dataset) assert isinstance(output, pl.DataFrame) - assert "score_context_conciseness" in output.columns and "explanation_context_conciseness" in output.columns - assert output["score_context_conciseness"].dtype == pl.Float64 and len(output["score_context_conciseness"]) - output["score_context_conciseness"].null_count() > 0 - assert output["explanation_context_conciseness"].dtype == pl.Utf8 and len(output["explanation_context_conciseness"]) - output["explanation_context_conciseness"].null_count() > 0 + assert ( + "score_context_conciseness" in output.columns + and "explanation_context_conciseness" in output.columns + ) + assert ( + output["score_context_conciseness"].dtype == pl.Float64 + and len(output["score_context_conciseness"]) + - output["score_context_conciseness"].null_count() + > 0 + ) + assert ( + output["explanation_context_conciseness"].dtype == pl.Utf8 + and len(output["explanation_context_conciseness"]) + - output["explanation_context_conciseness"].null_count() + > 0 + ) + def test_check_response_completeness_wrt_context(): check = CheckResponseCompletenessWrtContext() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_response_completeness_wrt_context" in output.columns and "explanation_response_completeness_wrt_context" in output.columns - assert output["score_response_completeness_wrt_context"].dtype == pl.Float64 and len(output["score_response_completeness_wrt_context"]) - output["score_response_completeness_wrt_context"].null_count() > 0 - assert output["explanation_response_completeness_wrt_context"].dtype == pl.Utf8 and len(output["explanation_response_completeness_wrt_context"]) - output["explanation_response_completeness_wrt_context"].null_count() > 0 + assert ( + "score_response_completeness_wrt_context" in output.columns + and "explanation_response_completeness_wrt_context" in output.columns + ) + assert ( + output["score_response_completeness_wrt_context"].dtype == pl.Float64 + and len(output["score_response_completeness_wrt_context"]) + - output["score_response_completeness_wrt_context"].null_count() + > 0 + ) + assert ( + output["explanation_response_completeness_wrt_context"].dtype == pl.Utf8 + and len(output["explanation_response_completeness_wrt_context"]) + - output["explanation_response_completeness_wrt_context"].null_count() + > 0 + ) def test_check_response_facts(): check = CheckResponseFacts() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_factual_accuracy" in output.columns and "explanation_factual_accuracy" in output.columns - assert output["score_factual_accuracy"].dtype == pl.Float64 and len(output["score_factual_accuracy"]) - output["score_factual_accuracy"].null_count() > 0 - assert output["explanation_factual_accuracy"].dtype == pl.Utf8 and len(output["explanation_factual_accuracy"]) - output["explanation_factual_accuracy"].null_count() > 0 + assert ( + "score_factual_accuracy" in output.columns + and "explanation_factual_accuracy" in output.columns + ) + assert ( + output["score_factual_accuracy"].dtype == pl.Float64 + and len(output["score_factual_accuracy"]) + - output["score_factual_accuracy"].null_count() + > 0 + ) + assert ( + output["explanation_factual_accuracy"].dtype == pl.Utf8 + and len(output["explanation_factual_accuracy"]) + - output["explanation_factual_accuracy"].null_count() + > 0 + ) # 
----------------------------------------------------------- # Language Proficiency # ----------------------------------------------------------- + def test_check_language_quality(): check = CheckLanguageQuality() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_critique_language" in output.columns and "explanation_critique_language" in output.columns - assert output["score_critique_language"].dtype == pl.Float64 and len(output["score_critique_language"]) - output["score_critique_language"].null_count() > 0 - assert output["explanation_critique_language"].dtype == pl.Utf8 and len(output["explanation_critique_language"]) - output["explanation_critique_language"].null_count() > 0 + assert ( + "score_critique_language" in output.columns + and "explanation_critique_language" in output.columns + ) + assert ( + output["score_critique_language"].dtype == pl.Float64 + and len(output["score_critique_language"]) + - output["score_critique_language"].null_count() + > 0 + ) + assert ( + output["explanation_critique_language"].dtype == pl.Utf8 + and len(output["explanation_critique_language"]) + - output["explanation_critique_language"].null_count() + > 0 + ) def test_check_tone_quality(): check = CheckToneQuality(llm_persona="wikipedia-bot") output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_critique_tone" in output.columns and "explanation_critique_tone" in output.columns - assert output["score_critique_tone"].dtype == pl.Float64 and len(output["score_critique_tone"]) - output["score_critique_tone"].null_count() > 0 - assert output["explanation_critique_tone"].dtype == pl.Utf8 and len(output["explanation_critique_tone"]) - output["explanation_critique_tone"].null_count() > 0 + assert ( + "score_critique_tone" in output.columns + and "explanation_critique_tone" in output.columns + ) + assert ( + output["score_critique_tone"].dtype == pl.Float64 + and len(output["score_critique_tone"]) + - output["score_critique_tone"].null_count() + > 0 + ) + assert ( + output["explanation_critique_tone"].dtype == pl.Utf8 + and len(output["explanation_critique_tone"]) + - output["explanation_critique_tone"].null_count() + > 0 + ) # # ----------------------------------------------------------- # # Code Hallucinations # # ----------------------------------------------------------- - + code_hallucination_dataset = pl.DataFrame( { "question": [ @@ -265,23 +439,37 @@ def test_check_tone_quality(): "Can I create histograms with different bucket colors in Streamlit", ], "context": [ - "This property lets you store Python primitives such as integers, floating-point numbers, complex numbers and booleans, dataframes, and even [lambdas](https://docs.python.org/3/reference/expressions.html#lambda) returned by functions. However, some execution environments may require serializing all data in Session State, so it may be useful to detect incompatibility during development, or when the execution environment will stop supporting it in the future.\n\nTo that end, Streamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State. 
To enable the option, either create a global or project config file with the following or use it as a command-line flag:\n\n\n```\n# .streamlit/config.toml\n[runner]\nenforceSerializableSessionState = true\n\n```\nBy \"*pickle-serializable*\", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception. When the config option is enabled, adding unserializable data to session state should result in an exception. E.g.,\n\n\n```\nimport streamlit as st\n\ndef unserializable_data():\n return lambda x: x\n\n#👇 results in an exception when enforceSerializableSessionState is on\nst.session_state.unserializable = unserializable_data()\n\n```\n![UnserializableSessionStateError](https://docs.streamlit.io/images/unserializable-session-state-error.png)", - "eader(\"Define a custom colorscale\")\ndf = px.data.iris()\nfig = px.scatter(\n df,\n x=\"sepal_width\",\n y=\"sepal_length\",\n color=\"sepal_length\",\n color_continuous_scale=\"reds\",\n)\n\ntab1, tab2 = st.tabs([\"Streamlit theme (default)\", \"Plotly native theme\"])\nwith tab1:\n st.plotly_chart(fig, theme=\"streamlit\", use_container_width=True)\nwith tab2:\n st.plotly_chart(fig, theme=None, use_container_width=True)\n\n```\nNotice how the custom color scale is still reflected in the chart, even when the Streamlit theme is enabled 👇\n\nFor many more examples of Plotly charts with and without the Streamlit theme, check out the [plotly.streamlit.app](https://plotly.streamlit.app).\n\n" + 'This property lets you store Python primitives such as integers, floating-point numbers, complex numbers and booleans, dataframes, and even [lambdas](https://docs.python.org/3/reference/expressions.html#lambda) returned by functions. However, some execution environments may require serializing all data in Session State, so it may be useful to detect incompatibility during development, or when the execution environment will stop supporting it in the future.\n\nTo that end, Streamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State. To enable the option, either create a global or project config file with the following or use it as a command-line flag:\n\n\n```\n# .streamlit/config.toml\n[runner]\nenforceSerializableSessionState = true\n\n```\nBy "*pickle-serializable*", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception. When the config option is enabled, adding unserializable data to session state should result in an exception. 
E.g.,\n\n\n```\nimport streamlit as st\n\ndef unserializable_data():\n return lambda x: x\n\n#👇 results in an exception when enforceSerializableSessionState is on\nst.session_state.unserializable = unserializable_data()\n\n```\n![UnserializableSessionStateError](https://docs.streamlit.io/images/unserializable-session-state-error.png)', + 'eader("Define a custom colorscale")\ndf = px.data.iris()\nfig = px.scatter(\n df,\n x="sepal_width",\n y="sepal_length",\n color="sepal_length",\n color_continuous_scale="reds",\n)\n\ntab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])\nwith tab1:\n st.plotly_chart(fig, theme="streamlit", use_container_width=True)\nwith tab2:\n st.plotly_chart(fig, theme=None, use_container_width=True)\n\n```\nNotice how the custom color scale is still reflected in the chart, even when the Streamlit theme is enabled 👇\n\nFor many more examples of Plotly charts with and without the Streamlit theme, check out the [plotly.streamlit.app](https://plotly.streamlit.app).\n\n', ], "response": [ - "Serialization refers to the process of converting an object or data structure into a format that can be persisted and shared, and allowing you to recover the data’s original structure.\n\nBy default, Streamlit’s [Session State](https://docs.streamlit.io/library/advanced-features/session-state) allows you to persist any Python object for the duration of the session, irrespective of the object’s pickle-serializability. \n\nStreamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State.\n\nBy \"*pickle-serializable*\", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception", - "The `color_continuous_scale` parameter in the `px.scatter` function allows you to define a custom color scale for the plot. You can use any of the built-in color scales provided by Plotly, or you can define your own custom color scale using a list of colors. The `color_continuous_scale` parameter accepts a string that specifies the name of the color scale to use. You can also pass a list of colors to the `color_continuous_scale` parameter to define a custom color scale. The list of colors can be any length, and the colors can be specified using any valid CSS color format, such as hex, RGB, or HSL. The `color_continuous_scale` parameter allows you to create a custom color scale for the plot that is tailored to your specific needs." + 'Serialization refers to the process of converting an object or data structure into a format that can be persisted and shared, and allowing you to recover the data’s original structure.\n\nBy default, Streamlit’s [Session State](https://docs.streamlit.io/library/advanced-features/session-state) allows you to persist any Python object for the duration of the session, irrespective of the object’s pickle-serializability. 
\n\nStreamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State.\n\nBy "*pickle-serializable*", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception', + "The `color_continuous_scale` parameter in the `px.scatter` function allows you to define a custom color scale for the plot. You can use any of the built-in color scales provided by Plotly, or you can define your own custom color scale using a list of colors. The `color_continuous_scale` parameter accepts a string that specifies the name of the color scale to use. You can also pass a list of colors to the `color_continuous_scale` parameter to define a custom color scale. The list of colors can be any length, and the colors can be specified using any valid CSS color format, such as hex, RGB, or HSL. The `color_continuous_scale` parameter allows you to create a custom color scale for the plot that is tailored to your specific needs.", ], } ) + def test_check_code_hallucination(): check = CheckCodeHallucination() output = check.setup(settings).run(code_hallucination_dataset) assert isinstance(output, pl.DataFrame) - assert "score_code_hallucination" in output.columns and "explanation_code_hallucination" in output.columns - assert output["score_code_hallucination"].dtype == pl.Float64 and len(output["score_code_hallucination"]) - output["score_code_hallucination"].null_count() > 0 - assert output["explanation_code_hallucination"].dtype == pl.Utf8 and len(output["explanation_code_hallucination"]) - output["explanation_code_hallucination"].null_count() > 0 + assert ( + "score_code_hallucination" in output.columns + and "explanation_code_hallucination" in output.columns + ) + assert ( + output["score_code_hallucination"].dtype == pl.Float64 + and len(output["score_code_hallucination"]) + - output["score_code_hallucination"].null_count() + > 0 + ) + assert ( + output["explanation_code_hallucination"].dtype == pl.Utf8 + and len(output["explanation_code_hallucination"]) + - output["explanation_code_hallucination"].null_count() + > 0 + ) # # ----------------------------------------------------------- @@ -308,37 +496,69 @@ def test_check_code_hallucination(): Doctor: You should try to rest your knee. Patient: I have been resting it for a few days now. Doctor: I don't know what else to suggest. 
- """ + """, ] } ) + def test_check_conversation_satisfaction(): check = CheckConversationSatisfaction(user_role="Patient", llm_role="Doctor") output = check.setup(settings).run(conversation_satisfaction_dataset) assert isinstance(output, pl.DataFrame) - assert "score_conversation_satisfaction" in output.columns and "explanation_conversation_satisfaction" in output.columns - assert output["score_conversation_satisfaction"].dtype == pl.Float64 and len(output["score_conversation_satisfaction"]) - output["score_conversation_satisfaction"].null_count() > 0 - assert output["explanation_conversation_satisfaction"].dtype == pl.Utf8 and len(output["explanation_conversation_satisfaction"]) - output["explanation_conversation_satisfaction"].null_count() > 0 + assert ( + "score_conversation_satisfaction" in output.columns + and "explanation_conversation_satisfaction" in output.columns + ) + assert ( + output["score_conversation_satisfaction"].dtype == pl.Float64 + and len(output["score_conversation_satisfaction"]) + - output["score_conversation_satisfaction"].null_count() + > 0 + ) + assert ( + output["explanation_conversation_satisfaction"].dtype == pl.Utf8 + and len(output["explanation_conversation_satisfaction"]) + - output["explanation_conversation_satisfaction"].null_count() + > 0 + ) # ----------------------------------------------------------- # Custom Evaluations # ----------------------------------------------------------- - + + def test_check_guideline_adherence(): - check = CheckGuidelineAdherence(guideline="The response should not contain any numbers or statistic", guideline_name="guideline", response_schema=None) + check = CheckGuidelineAdherence( + guideline="The response should not contain any numbers or statistic", + guideline_name="guideline", + response_schema=None, + ) output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_guideline_adherence" in output.columns and "explanation_guideline_adherence" in output.columns - assert output["score_guideline_adherence"].dtype == pl.Float64 and len(output["score_guideline_adherence"]) - output["score_guideline_adherence"].null_count() > 0 - assert output["explanation_guideline_adherence"].dtype == pl.Utf8 and len(output["explanation_guideline_adherence"]) - output["explanation_guideline_adherence"].null_count() > 0 + assert ( + "score_guideline_adherence" in output.columns + and "explanation_guideline_adherence" in output.columns + ) + assert ( + output["score_guideline_adherence"].dtype == pl.Float64 + and len(output["score_guideline_adherence"]) + - output["score_guideline_adherence"].null_count() + > 0 + ) + assert ( + output["explanation_guideline_adherence"].dtype == pl.Utf8 + and len(output["explanation_guideline_adherence"]) + - output["explanation_guideline_adherence"].null_count() + > 0 + ) # # ----------------------------------------------------------- # # Compare response with ground truth # # ----------------------------------------------------------- - + # def test_check_response_matching(): # check = CheckResponseMatching() # output = check.setup(settings).run(dataset) @@ -351,29 +571,56 @@ def test_check_guideline_adherence(): # ----------------------------------------------------------- # Security # ----------------------------------------------------------- - + + def test_check_prompt_injection(): check = CheckPromptInjection() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_prompt_injection" in output.columns and 
"explanation_prompt_injection" in output.columns - assert output["score_prompt_injection"].dtype == pl.Float64 and len(output["score_prompt_injection"]) - output["score_prompt_injection"].null_count() > 0 - assert output["explanation_prompt_injection"].dtype == pl.Utf8 and len(output["explanation_prompt_injection"]) - output["explanation_prompt_injection"].null_count() > 0 + assert ( + "score_prompt_injection" in output.columns + and "explanation_prompt_injection" in output.columns + ) + assert ( + output["score_prompt_injection"].dtype == pl.Float64 + and len(output["score_prompt_injection"]) + - output["score_prompt_injection"].null_count() + > 0 + ) + assert ( + output["explanation_prompt_injection"].dtype == pl.Utf8 + and len(output["explanation_prompt_injection"]) + - output["explanation_prompt_injection"].null_count() + > 0 + ) def test_check_jailbreak_detection(): check = CheckJailbreakDetection() output = check.setup(settings).run(dataset) assert isinstance(output, pl.DataFrame) - assert "score_jailbreak_attempted" in output.columns and "explanation_jailbreak_attempted" in output.columns - assert output["score_jailbreak_attempted"].dtype == pl.Float64 and len(output["score_jailbreak_attempted"]) - output["score_jailbreak_attempted"].null_count() > 0 - assert output["explanation_jailbreak_attempted"].dtype == pl.Utf8 and len(output["explanation_jailbreak_attempted"]) - output["explanation_jailbreak_attempted"].null_count() > 0 + assert ( + "score_jailbreak_attempted" in output.columns + and "explanation_jailbreak_attempted" in output.columns + ) + assert ( + output["score_jailbreak_attempted"].dtype == pl.Float64 + and len(output["score_jailbreak_attempted"]) + - output["score_jailbreak_attempted"].null_count() + > 0 + ) + assert ( + output["explanation_jailbreak_attempted"].dtype == pl.Utf8 + and len(output["explanation_jailbreak_attempted"]) + - output["explanation_jailbreak_attempted"].null_count() + > 0 + ) # ----------------------------------------------------------- # Sub Query # ----------------------------------------------------------- - + sub_query_dataset = pl.DataFrame( { "question": [ @@ -411,11 +658,25 @@ def test_check_jailbreak_detection(): ], } ) - + + def test_check_sub_query_completeness(): check = CheckSubQueryCompleteness() output = check.setup(settings).run(sub_query_dataset) assert isinstance(output, pl.DataFrame) - assert "score_sub_query_completeness" in output.columns and "explanation_sub_query_completeness" in output.columns - assert output["score_sub_query_completeness"].dtype == pl.Float64 and len(output["score_sub_query_completeness"]) - output["score_sub_query_completeness"].null_count() > 0 - assert output["explanation_sub_query_completeness"].dtype == pl.Utf8 and len(output["explanation_sub_query_completeness"]) - output["explanation_sub_query_completeness"].null_count() > 0 + assert ( + "score_sub_query_completeness" in output.columns + and "explanation_sub_query_completeness" in output.columns + ) + assert ( + output["score_sub_query_completeness"].dtype == pl.Float64 + and len(output["score_sub_query_completeness"]) + - output["score_sub_query_completeness"].null_count() + > 0 + ) + assert ( + output["explanation_sub_query_completeness"].dtype == pl.Utf8 + and len(output["explanation_sub_query_completeness"]) + - output["explanation_sub_query_completeness"].null_count() + > 0 + ) diff --git a/uptrain/cli.py b/uptrain/cli.py index 37ae889b4..d1db0751a 100644 --- a/uptrain/cli.py +++ b/uptrain/cli.py @@ -18,7 +18,7 @@ "tqdm>=4.0", ], "st_classic": [ 
- "plotly>=5.0.0", + "plotly>=5.0.0", "streamlit>=1.23", "pyarrow>=10.0.0", ], diff --git a/uptrain/dashboard/backend/app.py b/uptrain/dashboard/backend/app.py index 3b6556459..16c747fac 100644 --- a/uptrain/dashboard/backend/app.py +++ b/uptrain/dashboard/backend/app.py @@ -41,12 +41,7 @@ from loguru import logger from sqlalchemy.orm import Session -from uptrain.utilities.db import ( - create_database, - ModelDataset, - ModelUser, - ModelPrompt -) +from uptrain.utilities.db import create_database, ModelDataset, ModelUser, ModelPrompt from uptrain.utilities.utils import ( get_sqlite_utils_db, _get_fsspec_filesystem, @@ -54,7 +49,7 @@ convert_project_to_dicts, checks_mapping, create_dirs, - get_current_datetime + get_current_datetime, ) from uptrain.utilities import polars_to_pandas @@ -74,9 +69,10 @@ def _row_to_dict(row): ACCESS_TOKEN = APIKeyHeader(name="uptrain-access-token", auto_error=False) # database -#/data/uptrain-server.db" +# /data/uptrain-server.db" create_dirs(DATABASE_PATH) -SessionLocal = create_database("sqlite:///" + DATABASE_PATH + 'uptrain-local-server.db') +SessionLocal = create_database("sqlite:///" + DATABASE_PATH + "uptrain-local-server.db") + def _create_user(db: Session, name: str): """Create a new user.""" @@ -90,6 +86,7 @@ def _create_user(db: Session, name: str): db.rollback() raise exc + def get_db(): """Get the database session.""" db = SessionLocal() @@ -98,9 +95,10 @@ def get_db(): finally: SessionLocal.remove() + try: _create_user(SessionLocal(), "default_key") -except: +except Exception: pass # some methods need a context manager to get the db @@ -115,9 +113,7 @@ def get_fsspec_fs(): pass -async def validate_api_key_public( - key_header: str = Security(ACCESS_TOKEN) -) -> str: +async def validate_api_key_public(key_header: str = Security(ACCESS_TOKEN)) -> str: """Validate API key and return the user id. 
For public API, the API key is the access token provided to them by uptrain and we @@ -128,16 +124,13 @@ async def validate_api_key_public( raise HTTPException(status_code=403, detail="Unspecified API key") else: with get_db_context() as db: - db_item = ( - db.query(ModelUser).filter_by(name=key_header).first() - ) + db_item = db.query(ModelUser).filter_by(name=key_header).first() if db_item is not None: return db_item.id else: raise HTTPException(status_code=403, detail="Invalid API key") - # ----------------------------------------------------------- # Routers # ----------------------------------------------------------- @@ -146,9 +139,10 @@ async def validate_api_key_public( router_internal = APIRouter() # ----------------------------------------------------------- -# Internal API +# Internal API # ----------------------------------------------------------- + @router_internal.post("/user") def add_user(user: app_schema.UserCreate, db: Session = Depends(get_db)): """Add a new user.""" @@ -173,15 +167,17 @@ def add_user(user: app_schema.UserCreate, db: Session = Depends(get_db)): # Request to get user name, API key, user credits used and total using api key @router_public.post("/user") def get_user( - user_id: str = Depends(validate_api_key_public), - db: Session = Depends(get_db) + user_id: str = Depends(validate_api_key_public), db: Session = Depends(get_db) ): user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=404, detail="User not found") else: - return {"id" : user_id, "user_name" : "open-source user", "api_key" : "default_key"} - + return { + "id": user_id, + "user_name": "open-source user", + "api_key": "default_key", + } @router_public.get("/get_project_data", response_model=app_schema.ProjectData) @@ -191,15 +187,14 @@ def get_project_data( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the data for a particular project_name for the given user. 
- """ + """Get all the data for a particular project_name for the given user.""" projects = get_projects_list(num_days=num_days, db=db, user_id=user_id) for project in projects.data: if project["project"] == project_name: run_via = project["run_via"] if run_via == "project" or run_via == "experiment": - if run_via == 'project': + if run_via == "project": query = f""" SELECT * FROM results @@ -211,13 +206,15 @@ def get_project_data( FROM results WHERE project = '{project_name}' AND metadata LIKE '%uptrain_experiment_columns%' AND timestamp > datetime('now', '-{num_days} days') """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join( + DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db" + ) if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - + buffer = io.StringIO() for row in DB.query(query): buffer.write(json.dumps(row) + "\n") @@ -228,27 +225,65 @@ def get_project_data( for key in details: try: details[key] = json.loads(details[key]) - except: + except Exception: pass data.append(details) - scores = [col[6:] for col in data[0]['checks'].keys() if col.startswith("score_")] + scores = [ + col[6:] + for col in data[0]["checks"].keys() + if col.startswith("score_") + ] if run_via == "project": - return app_schema.ProjectData(data = [data, None, project["latest_timestamp"][:10], None, scores], project_name = project_name) + return app_schema.ProjectData( + data=[ + data, + None, + project["latest_timestamp"][:10], + None, + scores, + ], + project_name=project_name, + ) else: exp_data = convert_project_to_polars(data) exp_column = str(exp_data["uptrain_experiment_columns"][0][0]) plot_data = {} for col in scores: - col_name = 'score_' + col - plot_data.update({col : exp_data.group_by([exp_column], maintain_order=True).agg(pl.col(col_name)).to_dicts()}) + col_name = "score_" + col + plot_data.update( + { + col: exp_data.group_by( + [exp_column], maintain_order=True + ) + .agg(pl.col(col_name)) + .to_dicts() + } + ) columns = exp_data.columns - columns.remove('question') - display_data = exp_data.group_by(["question"], maintain_order=True).agg(pl.col(col) for col in columns).to_dicts() - unqiue_values = list(set(exp_data[exp_column].to_list())) - return app_schema.ProjectData(data = [display_data, None, project["latest_timestamp"][:10], None, scores, unqiue_values, exp_column, plot_data], project_name = project_name) - + columns.remove("question") + display_data = ( + exp_data.group_by(["question"], maintain_order=True) + .agg(pl.col(col) for col in columns) + .to_dicts() + ) + unqiue_values = list(set(exp_data[exp_column].to_list())) + return app_schema.ProjectData( + data=[ + display_data, + None, + project["latest_timestamp"][:10], + None, + scores, + unqiue_values, + exp_column, + plot_data, + ], + project_name=project_name, + ) + + @router_public.get("/get_prompt_data", response_model=app_schema.ProjectData) def get_prompt_data( project_name: str, @@ -256,8 +291,7 @@ def get_prompt_data( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the data for a particular project_name for the given user. 
- """ + """Get all the data for a particular project_name for the given user.""" projects = get_projects_list(num_days=num_days, db=db, user_id=user_id) for project in projects.data: @@ -269,13 +303,15 @@ def get_prompt_data( FROM results WHERE project = '{project_name}' AND metadata like '%prompt_version%' AND metadata NOT LIKE '%uptrain_experiment_columns%' AND timestamp > datetime('now', '-{num_days} days') """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join( + DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db" + ) if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - + buffer = io.StringIO() for row in DB.query(query): buffer.write(json.dumps(row) + "\n") @@ -286,34 +322,40 @@ def get_prompt_data( for key in details: try: details[key] = json.loads(details[key]) - except: + except Exception: pass data.append(details) exp_data, checks_mapping = convert_project_to_dicts(data) - + columns = exp_data.columns - columns.remove('prompt_name') - columns.remove('prompt_version') - data = exp_data.group_by(['prompt_name', 'prompt_version'], maintain_order=True).agg(pl.col(col) for col in columns) + columns.remove("prompt_name") + columns.remove("prompt_version") + data = exp_data.group_by( + ["prompt_name", "prompt_version"], maintain_order=True + ).agg(pl.col(col) for col in columns) columns = data.columns - columns.remove('prompt_name') - data = data.group_by(['prompt_name'], maintain_order=True).agg(pl.col(col) for col in columns).to_dicts() - + columns.remove("prompt_name") + data = ( + data.group_by(["prompt_name"], maintain_order=True) + .agg(pl.col(col) for col in columns) + .to_dicts() + ) + for row in data: - row['scores'] = [] - uuid_tags_version = row['uuid_tag'] + row["scores"] = [] + uuid_tags_version = row["uuid_tag"] for uuid_tags in uuid_tags_version: scores = [] for uuid in uuid_tags: score = checks_mapping[uuid] scores.append(score) - row['scores'].append(pl.DataFrame(scores).mean().to_dicts()[0]) + row["scores"].append(pl.DataFrame(scores).mean().to_dicts()[0]) res = [] for prompt in data: prompt_data = [] - num_versions = len(prompt['prompt_version']) + num_versions = len(prompt["prompt_version"]) for i in range(num_versions): prompt_v = {} for key, value in prompt.items(): @@ -323,16 +365,32 @@ def get_prompt_data( # Remove the explanations from the scores elif key == "scores": try: - value = [{k: round(float(v), 3) for k, v in score.items() if not k.startswith("explanation")} for score in value] - except: - value = [{k: v for k, v in score.items() if not k.startswith("explanation")} for score in value] + value = [ + { + k: round(float(v), 3) + for k, v in score.items() + if not k.startswith("explanation") + } + for score in value + ] + except Exception: + value = [ + { + k: v + for k, v in score.items() + if not k.startswith("explanation") + } + for score in value + ] # Handle cases where the value is a list or a string if isinstance(value, list): prompt_v[key] = value[i] else: prompt_v[key] = value prompt_data.append(prompt_v) - res.append({"prompt_name": prompt["prompt_name"], "prompts": prompt_data}) + res.append( + {"prompt_name": prompt["prompt_name"], "prompts": prompt_data} + ) return app_schema.ProjectData(data=res, project_name=project_name) @@ -342,7 +400,7 @@ async def add_project_data( user_id: str = Depends(validate_api_key_public), db: Session = Depends(get_db), ): - + fpath = os.path.join(DATABASE_PATH, 
"uptrain-eval-results", f"{user_id}.db") DB = get_sqlite_utils_db(fpath) @@ -352,7 +410,7 @@ async def add_project_data( checks = eval_args.checks project = eval_args.project timestamp = get_current_datetime() - try: + try: DB["results"].insert_all( [ { @@ -361,16 +419,18 @@ async def add_project_data( "metadata": metadata, "schema": schema, "project": project, - "timestamp": timestamp + "timestamp": timestamp, } - for row_data, row_check in zip(results, checks) + for row_data, row_check in zip(results, checks) ] ) except Exception as e: logger.exception(f"Error running the eval: {e}") - raise HTTPException(status_code=500, detail=f"Error saving the data for the project: {e}") + raise HTTPException( + status_code=500, detail=f"Error saving the data for the project: {e}" + ) + - @router_public.get("/get_projects_list", response_model=app_schema.ProjectsList) def get_projects_list( num_days: int = 200, @@ -379,8 +439,7 @@ def get_projects_list( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the project names associated with the user. - """ + """Get all the project names associated with the user.""" user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=403, detail="Invalid user name") @@ -395,15 +454,15 @@ def get_projects_list( ORDER BY latest_timestamp DESC LIMIT {limit} """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db") if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - experiment_runs = DB.query(query) - except: + experiment_runs = DB.query(query) + except Exception: experiment_runs = [] try: @@ -415,20 +474,22 @@ def get_projects_list( ORDER BY latest_timestamp DESC LIMIT {limit} """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db") if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - project_runs = DB.query(query) - except: + project_runs = DB.query(query) + except Exception: project_runs = [] try: - prompts_runs = get_prompts_list(num_days=num_days, limit=limit, db=db, user_id=user_id) - except: + prompts_runs = get_prompts_list( + num_days=num_days, limit=limit, db=db, user_id=user_id + ) + except Exception: prompts_runs = [] out = [] @@ -447,7 +508,7 @@ def get_projects_list( { "project": run["project"], "latest_timestamp": run["latest_timestamp"], - "run_via": "experiment" + "run_via": "experiment", } ) @@ -456,12 +517,12 @@ def get_projects_list( { "project": run["project"], "latest_timestamp": run["latest_timestamp"], - "run_via": "prompt" + "run_via": "prompt", } ) out.sort(reverse=True, key=lambda x: x["latest_timestamp"]) - return app_schema.ProjectsList(data = out, user_name = user_name) + return app_schema.ProjectsList(data=out, user_name=user_name) @router_public.get("/get_evaluations_list", response_model=app_schema.ProjectsList) @@ -472,8 +533,7 @@ def get_evaluations_list( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the project names associated with the user. 
- """ + """Get all the project names associated with the user.""" user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=403, detail="Invalid user name") @@ -490,19 +550,19 @@ def get_evaluations_list( LIMIT {limit} """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db") if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - project_runs = DB.query(query) - except: + project_runs = DB.query(query) + except Exception: project_runs = [] out = [] - + for run in project_runs: out.append( { @@ -513,7 +573,7 @@ def get_evaluations_list( ) out.sort(reverse=True, key=lambda x: x["latest_timestamp"]) - return app_schema.ProjectsList(data = out, user_name = user_name) + return app_schema.ProjectsList(data=out, user_name=user_name) @router_public.get("/get_experiments_list", response_model=app_schema.ProjectsList) @@ -523,8 +583,7 @@ def get_experiments_list( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the experiment names associated with the user. - """ + """Get all the experiment names associated with the user.""" user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=403, detail="Invalid user name") @@ -541,15 +600,15 @@ def get_experiments_list( LIMIT {limit} """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db") if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - project_runs = DB.query(query) - except: + project_runs = DB.query(query) + except Exception: project_runs = [] out = [] @@ -564,7 +623,7 @@ def get_experiments_list( ) out.sort(reverse=True, key=lambda x: x["latest_timestamp"]) - return app_schema.ProjectsList(data = out, user_name = user_name) + return app_schema.ProjectsList(data=out, user_name=user_name) @router_public.get("/get_prompts_list", response_model=app_schema.ProjectsList) @@ -574,8 +633,7 @@ def get_prompts_list( db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): - """Get all the experiment names associated with the user. 
- """ + """Get all the experiment names associated with the user.""" user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=403, detail="Invalid user name") @@ -591,15 +649,15 @@ def get_prompts_list( ORDER BY latest_timestamp DESC LIMIT {limit} """ - fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db") + fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db") if not os.path.exists(fpath): raise HTTPException( status_code=404, detail="No evaluations run yet for this user" ) DB = get_sqlite_utils_db(fpath) - prompts_runs = DB.query(query) - except: + prompts_runs = DB.query(query) + except Exception: prompts_runs = [] out = [] @@ -614,12 +672,12 @@ def get_prompts_list( ) out.sort(reverse=True, key=lambda x: x["latest_timestamp"]) - return app_schema.ProjectsList(data = out, user_name = user_name) + return app_schema.ProjectsList(data=out, user_name=user_name) @router_public.post("/find_common_topic") async def find_common_topic( - args : app_schema.TopicGenerate, + args: app_schema.TopicGenerate, db: Session = Depends(get_db), user_id: str = Depends(validate_api_key_public), ): @@ -632,22 +690,32 @@ async def find_common_topic( for elem in dataset: if elem[1] is not None and elem[1] == 0.0: refined_items.append(elem[0]) - - refined_items = refined_items[:min(50, len(refined_items))] - data = list(map(lambda x: {'question': x, 'cluster_index' : 0, 'cluster_index_distance' : 0}, refined_items)) + + refined_items = refined_items[: min(50, len(refined_items))] + data = list( + map( + lambda x: {"question": x, "cluster_index": 0, "cluster_index_distance": 0}, + refined_items, + ) + ) from uptrain.operators import TopicGenerator + user = db.query(ModelUser).filter_by(id=user_id).first() if user is None: raise HTTPException(status_code=403, detail="Invalid user name") else: user_name = user.name - - user_headers={"openai_api_key": user_name} - + + user_headers = {"openai_api_key": user_name} + try: - result = TopicGenerator().setup(Settings(**user_headers)).run(pl.DataFrame(data))['output'] - return {'common_topic': result.to_dicts()[0]['topic']} + result = ( + TopicGenerator() + .setup(Settings(**user_headers)) + .run(pl.DataFrame(data))["output"] + ) + return {"common_topic": result.to_dicts()[0]["topic"]} except Exception as exc: logger.exception("Error creating run") db.rollback() @@ -667,7 +735,7 @@ async def add_evaluation( fsspec_fs: t.Any = Depends(get_fsspec_fs), ): ## project key would be present in the eval_args.metadata - + existing_dataset = ( db.query(ModelDataset) .filter_by(name=dataset_name, user_id=user_id) @@ -718,18 +786,31 @@ async def add_evaluation( checks_1.append(final_check) settings_data = {} - settings_data['model'] = model + settings_data["model"] = model settings_data.update(metadata[model]) try: from uptrain import EvalLLM + user_client = EvalLLM(Settings(**settings_data)) - data = JsonReader(fpath = os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)).setup(Settings()).run()['output'].to_dicts() - results = user_client.evaluate(data=data, checks=checks_1, project_name=project_name) + data = ( + JsonReader( + fpath=os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version) + ) + .setup(Settings()) + .run()["output"] + .to_dicts() + ) + results = user_client.evaluate( + data=data, checks=checks_1, project_name=project_name + ) return {"message": f"Evaluation has been queued up"} except Exception as e: logger.exception(f"Error running the eval: {e}") - 
raise HTTPException(status_code=500, detail=f"Error running the evaluation: {e}") + raise HTTPException( + status_code=500, detail=f"Error running the evaluation: {e}" + ) + @router_public.post("/add_prompts") async def add_prompts( @@ -752,7 +833,7 @@ async def add_prompts( raise HTTPException(status_code=403, detail="Invalid user name") else: user_name = user.name - + existing_dataset = ( db.query(ModelDataset) .filter_by(name=dataset_name, user_id=user_id) @@ -787,7 +868,7 @@ async def add_prompts( raise HTTPException( status_code=400, detail="Error adding/updating dataset to platform" ) - + existing_prompt = ( db.query(ModelPrompt) .filter_by(name=prompt_name, user_id=user_id) @@ -800,10 +881,7 @@ async def add_prompts( version = 1 try: db_item = ModelPrompt( - user_id=user_id, - name=prompt_name, - version=version, - prompt=prompt + user_id=user_id, name=prompt_name, version=version, prompt=prompt ) db.add(db_item) db.commit() @@ -813,7 +891,7 @@ async def add_prompts( raise HTTPException( status_code=400, detail="Error adding/updating prompts to platform" ) - + checks = eval(checks[0]) checks_1 = [] metadata = eval(metadata) @@ -828,31 +906,46 @@ async def add_prompts( checks_1.append(final_check) settings_data = {} - settings_data['model'] = model + settings_data["model"] = model settings_data.update(metadata[model]) from uptrain.operators import JsonReader from uptrain import Settings as UserSettings metadata = None - metadata = {'project': project_name, 'prompt': prompt, 'prompt_name': prompt_name,'prompt_version': version, 'model': model} + metadata = { + "project": project_name, + "prompt": prompt, + "prompt_name": prompt_name, + "prompt_version": version, + "model": model, + } try: from uptrain import EvalLLM + user_client = EvalLLM(Settings(**settings_data)) - data = JsonReader(fpath = os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)).setup(UserSettings()).run()['output'].to_dicts() + data = ( + JsonReader( + fpath=os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version) + ) + .setup(UserSettings()) + .run()["output"] + .to_dicts() + ) results = user_client.evaluate_prompts( project_name=project_name, - data=data, + data=data, checks=checks_1, - prompt=prompt, - metadata=metadata - ) + prompt=prompt, + metadata=metadata, + ) return {"message": f"Evaluation has been queued up"} except Exception as e: logger.exception(f"Error running the eval: {e}") - raise HTTPException(status_code=500, detail=f"Error running the evaluation: {e}") - + raise HTTPException( + status_code=500, detail=f"Error running the evaluation: {e}" + ) # ----------------------------------------------------------- @@ -873,4 +966,4 @@ async def add_prompts( app.include_router(router_internal, prefix="/api/internal", tags=["internal"]) if __name__ == "__main__": - uvicorn.run("app:app", host="0.0.0.0", port=4300, workers=3) \ No newline at end of file + uvicorn.run("app:app", host="0.0.0.0", port=4300, workers=3) diff --git a/uptrain/dashboard/backend/nest_asyncio.py b/uptrain/dashboard/backend/nest_asyncio.py index 718b89a23..4c96811c6 100644 --- a/uptrain/dashboard/backend/nest_asyncio.py +++ b/uptrain/dashboard/backend/nest_asyncio.py @@ -26,7 +26,7 @@ def run(main, *, debug=False): try: loop = asyncio.get_event_loop() except RuntimeError as e: - if str(e).startswith('There is no current event loop in thread'): + if str(e).startswith("There is no current event loop in thread"): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) else: @@ -48,19 +48,20 @@ def 
_get_event_loop(stacklevel=3): return loop # Use module level _current_tasks, all_tasks and patch run method. - if hasattr(asyncio, '_nest_patched'): + if hasattr(asyncio, "_nest_patched"): return if sys.version_info >= (3, 6, 0): - asyncio.Task = asyncio.tasks._CTask = asyncio.tasks.Task = \ - asyncio.tasks._PyTask - asyncio.Future = asyncio.futures._CFuture = asyncio.futures.Future = \ + asyncio.Task = asyncio.tasks._CTask = asyncio.tasks.Task = asyncio.tasks._PyTask + asyncio.Future = asyncio.futures._CFuture = asyncio.futures.Future = ( asyncio.futures._PyFuture + ) if sys.version_info < (3, 7, 0): asyncio.tasks._current_tasks = asyncio.tasks.Task._current_tasks asyncio.all_tasks = asyncio.tasks.Task.all_tasks if sys.version_info >= (3, 9, 0): - events._get_event_loop = events.get_event_loop = \ - asyncio.get_event_loop = _get_event_loop + events._get_event_loop = events.get_event_loop = asyncio.get_event_loop = ( + _get_event_loop + ) asyncio.run = run asyncio._nest_patched = True @@ -100,8 +101,7 @@ def run_until_complete(self, future): if self._stopping: break if not f.done(): - raise RuntimeError( - 'Event loop stopped before Future completed.') + raise RuntimeError("Event loop stopped before Future completed.") return f.result() def _run_once(self): @@ -115,10 +115,14 @@ def _run_once(self): heappop(scheduled) timeout = ( - 0 if ready or self._stopping - else min(max( - scheduled[0]._when - self.time(), 0), 86400) if scheduled - else None) + 0 + if ready or self._stopping + else ( + min(max(scheduled[0]._when - self.time(), 0), 86400) + if scheduled + else None + ) + ) event_list = self._selector.select(timeout) self._process_events(event_list) @@ -164,8 +168,10 @@ def manage_run(self): events._set_running_loop(old_running_loop) self._num_runs_pending -= 1 if self._is_proactorloop: - if (self._num_runs_pending == 0 - and self._self_reading_future is not None): + if ( + self._num_runs_pending == 0 + and self._self_reading_future is not None + ): ov = self._self_reading_future._ov self._self_reading_future.cancel() if ov is not None: @@ -174,7 +180,7 @@ def manage_run(self): @contextmanager def manage_asyncgens(self): - if not hasattr(sys, 'get_asyncgen_hooks'): + if not hasattr(sys, "get_asyncgen_hooks"): # Python version is too old. 
return old_agen_hooks = sys.get_asyncgen_hooks() @@ -183,7 +189,8 @@ def manage_asyncgens(self): if self._asyncgens is not None: sys.set_asyncgen_hooks( firstiter=self._asyncgen_firstiter_hook, - finalizer=self._asyncgen_finalizer_hook) + finalizer=self._asyncgen_finalizer_hook, + ) yield finally: self._set_coroutine_origin_tracking(False) @@ -194,10 +201,10 @@ def _check_running(self): """Do not throw exception if loop is already running.""" pass - if hasattr(loop, '_nest_patched'): + if hasattr(loop, "_nest_patched"): return if not isinstance(loop, asyncio.BaseEventLoop): - raise ValueError('Can\'t patch loop of type %s' % type(loop)) + raise ValueError("Can't patch loop of type %s" % type(loop)) cls = loop.__class__ cls.run_forever = run_forever cls.run_until_complete = run_until_complete @@ -205,12 +212,16 @@ def _check_running(self): cls._check_running = _check_running cls._check_runnung = _check_running # typo in Python 3.7 source cls._num_runs_pending = 1 if loop.is_running() else 0 - cls._is_proactorloop = ( - os.name == 'nt' and issubclass(cls, asyncio.ProactorEventLoop)) + cls._is_proactorloop = os.name == "nt" and issubclass( + cls, asyncio.ProactorEventLoop + ) if sys.version_info < (3, 7, 0): cls._set_coroutine_origin_tracking = cls._set_coroutine_wrapper - curr_tasks = asyncio.tasks._current_tasks \ - if sys.version_info >= (3, 7, 0) else asyncio.Task._current_tasks + curr_tasks = ( + asyncio.tasks._current_tasks + if sys.version_info >= (3, 7, 0) + else asyncio.Task._current_tasks + ) cls._nest_patched = True @@ -219,8 +230,9 @@ def _patch_tornado(): If tornado is imported before nest_asyncio, make tornado aware of the pure-Python asyncio Future. """ - if 'tornado' in sys.modules: + if "tornado" in sys.modules: import tornado.concurrent as tc # type: ignore + tc.Future = asyncio.Future if asyncio.Future not in tc.FUTURES: - tc.FUTURES += (asyncio.Future,) \ No newline at end of file + tc.FUTURES += (asyncio.Future,) diff --git a/uptrain/framework/base.py b/uptrain/framework/base.py index 9c7fd331b..e6945a053 100644 --- a/uptrain/framework/base.py +++ b/uptrain/framework/base.py @@ -8,10 +8,15 @@ from loguru import logger import networkx as nx import polars as pl -from pydantic import BaseSettings, Field +from pydantic import Field -from uptrain.operators.base import * +from uptrain.operators.base import ( + Operator, + TransformOp, + deserialize_operator, +) from uptrain.utilities import to_py_types, jsondump, jsonload +from pydantic_settings import BaseSettings, SettingsConfigDict __all__ = [ "OperatorDAG", @@ -23,32 +28,46 @@ class Settings(BaseSettings): # uptrain stores logs in this folder logs_folder: str = "/tmp/uptrain-logs" # external api related - openai_api_key: str = Field(None, env="OPENAI_API_KEY") - cohere_api_key: str = Field(None, env="COHERE_API_KEY") - huggingface_api_key: str = Field(None, env="HUGGINGFACE_API_KEY") - anthropic_api_key: str = Field(None, env="ANTHROPIC_API_KEY") - replicate_api_token: str = Field(None, env="REPLICATE_API_TOKEN") - anyscale_api_key: str = Field(None, env="ANYSCALE_API_KEY") - together_api_key: str = Field(None, env="TOGETHER_API_KEY") - mistral_api_key: str = Field(None, env="MISTRAL_API_KEY") - - azure_api_key: str = Field(None, env="AZURE_API_KEY") - azure_api_base: str = Field(None, env="AZURE_API_BASE") - azure_api_version: str = Field(None, env="AZURE_API_VERSION") + openai_api_key: t.Optional[str] = Field(None, validation_alias="OPENAI_API_KEY") + cohere_api_key: t.Optional[str] = Field(None, 
validation_alias="COHERE_API_KEY") + huggingface_api_key: t.Optional[str] = Field( + None, validation_alias="HUGGINGFACE_API_KEY" + ) + anthropic_api_key: t.Optional[str] = Field( + None, validation_alias="ANTHROPIC_API_KEY" + ) + replicate_api_token: t.Optional[str] = Field( + None, validation_alias="REPLICATE_API_TOKEN" + ) + anyscale_api_key: t.Optional[str] = Field(None, validation_alias="ANYSCALE_API_KEY") + together_api_key: t.Optional[str] = Field(None, validation_alias="TOGETHER_API_KEY") + mistral_api_key: t.Optional[str] = Field(None, validation_alias="MISTRAL_API_KEY") + + azure_api_key: t.Optional[str] = Field(None, validation_alias="AZURE_API_KEY") + azure_api_base: t.Optional[str] = Field(None, validation_alias="AZURE_API_BASE") + azure_api_version: t.Optional[str] = Field( + None, validation_alias="AZURE_API_VERSION" + ) rpm_limit: int = 100 tpm_limit: int = 90_000 embedding_compute_method: t.Literal["local", "replicate", "api"] = "local" # uptrain managed service related - uptrain_access_token: str = Field(None, env="UPTRAIN_ACCESS_TOKEN") + uptrain_access_token: t.Optional[str] = Field( + None, validation_alias="UPTRAIN_ACCESS_TOKEN" + ) uptrain_server_url: str = Field( - "https://demo.uptrain.ai/", env="UPTRAIN_SERVER_URL" + "https://demo.uptrain.ai/", validation_alias="UPTRAIN_SERVER_URL" ) # Embedding model related, applicable if embedding_compute_method is api. - embedding_model_url: str = Field(None, env="EMBEDDING_MODEL_URL") - embedding_model_api_token: str = Field(None, env="EMBEDDING_MODEL_API_TOKEN") + embedding_model_url: t.Optional[str] = Field( + None, validation_alias="EMBEDDING_MODEL_URL" + ) + embedding_model_api_token: t.Optional[str] = Field( + None, validation_alias="EMBEDDING_MODEL_API_TOKEN" + ) # LLM model to run the evaluations model: str = "gpt-3.5-turbo-1106" @@ -60,10 +79,7 @@ class Settings(BaseSettings): # Cot -> We will use chain of thought prompting to evaluate and get the grade # basic -> We will simply prompt the LLM to return the grade without any reasoning eval_type: t.Literal["basic", "cot"] = "cot" - - # allow additional fields as needed by different operators - class Config: - extra = "allow" + model_config = SettingsConfigDict(extra="allow") def __init__(self, **data): super().__init__(**data) diff --git a/uptrain/framework/builtins.py b/uptrain/framework/builtins.py index 50a386bf9..1bea8adaf 100644 --- a/uptrain/framework/builtins.py +++ b/uptrain/framework/builtins.py @@ -30,6 +30,7 @@ # Response Quality # ----------------------------------------------------------- + def CheckResponseCompleteness(): return Check( name="response_completeness_score", @@ -82,6 +83,7 @@ def CheckResponseMatching(method="llm"): # Context Quality # ----------------------------------------------------------- + def CheckContextRelevance(): return Check( name="score_context_relevance", @@ -126,6 +128,7 @@ def CheckContextConciseness(): # Language Proficiency # ----------------------------------------------------------- + def CheckLanguageQuality(): return Check( name="language_critique_score", @@ -148,6 +151,7 @@ def CheckToneQuality(llm_persona): # Code generation # ----------------------------------------------------------- + def CheckCodeHallucination(): return Check( name="code_hallucination_score", @@ -160,10 +164,13 @@ def CheckCodeHallucination(): # Conversation Quality # ----------------------------------------------------------- + def CheckConversationSatisfaction(user_role="user", llm_role="assistant"): return Check( 
name="conversation_satisfaction_score", - operators=[ConversationSatisfactionScore(user_role=user_role, llm_role=llm_role)], + operators=[ + ConversationSatisfactionScore(user_role=user_role, llm_role=llm_role) + ], plots=[Histogram(x="score_conversation_satisfaction")], ) @@ -172,6 +179,7 @@ def CheckConversationSatisfaction(user_role="user", llm_role="assistant"): # Custom Evaluations # ----------------------------------------------------------- + def CheckGuidelineAdherence( guideline, guideline_name="guideline", response_schema=None ): @@ -192,6 +200,7 @@ def CheckGuidelineAdherence( # Security # ----------------------------------------------------------- + def CheckPromptInjection(): return Check( name="prompt_injection_score", @@ -212,6 +221,7 @@ def CheckJailbreakDetection(): # Subquery # ----------------------------------------------------------- + def CheckSubQueryCompleteness(): return Check( name="sub_query_completeness_score", diff --git a/uptrain/framework/checks.py b/uptrain/framework/checks.py index 8c23176dc..292d5dfd6 100644 --- a/uptrain/framework/checks.py +++ b/uptrain/framework/checks.py @@ -1,5 +1,6 @@ """Implements `Check` objects used for LLM evaluation purposes. """ + from __future__ import annotations from dataclasses import dataclass import os @@ -9,7 +10,12 @@ import polars as pl from pydantic import BaseModel -from uptrain.operators.base import * +from uptrain.operators.base import ( + Operator, + TransformOp, + ColumnOp, + deserialize_operator, +) from uptrain.utilities import jsonload, jsondump, to_py_types, clear_directory from uptrain.framework.base import OperatorDAG, Settings diff --git a/uptrain/framework/evalllm.py b/uptrain/framework/evalllm.py index 416d0cc2b..1565cd09a 100644 --- a/uptrain/framework/evalllm.py +++ b/uptrain/framework/evalllm.py @@ -49,9 +49,7 @@ from uptrain.framework.rca_templates import RcaTemplate from uptrain.operators import RagWithCitation -RCA_TEMPLATE_TO_OPERATOR_MAPPING = { - RcaTemplate.RAG_WITH_CITATION: RagWithCitation() -} +RCA_TEMPLATE_TO_OPERATOR_MAPPING = {RcaTemplate.RAG_WITH_CITATION: RagWithCitation()} EVAL_TO_OPERATOR_MAPPING = { Evals.FACTUAL_ACCURACY: ResponseFactualScore(), @@ -88,6 +86,7 @@ def __init__(self, settings: Settings = None, openai_api_key: str = None) -> Non else: self.settings = settings self.executor = APIClientWithoutAuth(self.settings) + #### def perform_root_cause_analysis( self, @@ -115,7 +114,6 @@ def perform_root_cause_analysis( elif isinstance(data, pd.DataFrame): data = data.to_dict(orient="records") - if schema is None: schema = DataSchema() elif isinstance(schema, dict): @@ -124,7 +122,6 @@ def perform_root_cause_analysis( if metadata is None: metadata = {} - req_attrs, ser_template = set(), {} if rca_template == RcaTemplate.RAG_WITH_CITATION: req_attrs.update( @@ -152,9 +149,7 @@ def perform_root_cause_analysis( else scenario_description[idx] ) res = ( - op.setup(self.settings) - .run(pl.DataFrame(data))["output"] - .to_dicts() + op.setup(self.settings).run(pl.DataFrame(data))["output"].to_dicts() ) else: res = self.evaluate_on_server(data, [ser_template], schema) @@ -164,7 +159,6 @@ def perform_root_cause_analysis( results = self.evaluate_on_server(data, [ser_template], schema) return results - def evaluate( self, data: t.Union[list[dict], pl.DataFrame, pd.DataFrame], @@ -270,7 +264,11 @@ def evaluate( if self.settings.evaluate_locally: results = copy.deepcopy(data) for idx, check in enumerate(checks): - if isinstance(check, ParametricEval) and ser_checks[idx]["check_name"] in 
PARAMETRIC_EVAL_TO_OPERATOR_MAPPING: + if ( + isinstance(check, ParametricEval) + and ser_checks[idx]["check_name"] + in PARAMETRIC_EVAL_TO_OPERATOR_MAPPING + ): # Use the check_name field to get the operator and remove it from ser_checks op = PARAMETRIC_EVAL_TO_OPERATOR_MAPPING[ ser_checks[idx].pop("check_name") @@ -305,34 +303,31 @@ def evaluate( headers={"uptrain-access-token": "default_key"}, timeout=httpx.Timeout(7200, connect=5), ) - response = client.post( - url, - json={"name": "default_key"} - ) + response = client.post(url, json={"name": "default_key"}) - user_id = response.json()['id'] + user_id = response.json()["id"] checks = [] for res in results: row_check = {} for key in res: - if key.startswith('score') or key.startswith('explanation'): + if key.startswith("score") or key.startswith("explanation"): row_check.update({key: res[key]}) checks.append(row_check) - + url = "http://localhost:4300/api/public/add_project_data" response = client.post( - url, - json={ - "data": results, - "checks": checks, - "metadata": metadata, - "schema_dict": schema.dict(), - "project": project_name, - }, - ) - except: + url, + json={ + "data": results, + "checks": checks, + "metadata": metadata, + "schema_dict": schema.dict(), + "project": project_name, + }, + ) + except Exception: user_id = "default_key" - logger.info('Server is not running!') + logger.info("Server is not running!") return results def evaluate_on_server(self, data, ser_checks, schema): @@ -414,13 +409,12 @@ def evaluate_experiments( exp_results = exp_results.to_dicts() return exp_results - def evaluate_prompts( self, project_name: str, data: t.Union[list[dict], pl.DataFrame], checks: list[t.Union[str, Evals, ParametricEval]], - prompt: str, + prompt: str, schema: t.Union[DataSchema, dict[str, str], None] = None, metadata: t.Optional[dict[str, t.Any]] = None, ): @@ -440,10 +434,10 @@ def evaluate_prompts( """ if metadata is None: metadata = {} - + base_prompt, prompt_vars = parse_prompt(prompt) - prompts =[] + prompts = [] context_vars = {} context_vars.update(zip(prompt_vars, prompt_vars)) for idx, item in enumerate(data): @@ -453,18 +447,24 @@ def evaluate_prompts( model = metadata["model"] dataset = pl.DataFrame(data) - dataset = dataset.with_columns(pl.Series(name="model", values=[model] * len(dataset))) - dataset = dataset.with_columns(pl.Series(name="prompt", values= prompts)) - + dataset = dataset.with_columns( + pl.Series(name="model", values=[model] * len(dataset)) + ) + dataset = dataset.with_columns(pl.Series(name="prompt", values=prompts)) + from uptrain.operators import TextCompletion - - dataset = TextCompletion( - col_in_prompt = "prompt", - col_in_model = "model", - col_out_completion = "response", - temperature = 0.0 - ).setup(self.settings).run(dataset)['output'] - + + dataset = ( + TextCompletion( + col_in_prompt="prompt", + col_in_model="model", + col_out_completion="response", + temperature=0.0, + ) + .setup(self.settings) + .run(dataset)["output"] + ) + dataset = dataset.to_dicts() if schema is None: diff --git a/uptrain/framework/evals.py b/uptrain/framework/evals.py index f784a93a7..0434894fd 100644 --- a/uptrain/framework/evals.py +++ b/uptrain/framework/evals.py @@ -1,7 +1,8 @@ import enum -import pydantic import typing as t +from pydantic import BaseModel + class Evals(enum.Enum): CONTEXT_RELEVANCE = "context_relevance" @@ -22,8 +23,7 @@ class Evals(enum.Enum): CONTEXT_CONCISENESS = "context_conciseness" -class ParametricEval(pydantic.BaseModel): - ... +class ParametricEval(BaseModel): ... 
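# For orientation, a minimal usage sketch of how `Evals` members and
# `ParametricEval` subclasses travel together through the `checks` argument of
# `EvalLLM.evaluate`, mirroring the dashboard endpoints earlier in this diff.
# The dataset row, guideline text and project name are illustrative
# placeholders, and the sketch assumes OPENAI_API_KEY is set in the
# environment so the pydantic-settings based `Settings` can pick it up.

from uptrain import EvalLLM, Settings
from uptrain.framework.evals import Evals, GuidelineAdherence

eval_client = EvalLLM(Settings())
sample_data = [
    {
        "question": "What is the capital of France?",
        "context": "France is a country in Europe. Its capital city is Paris.",
        "response": "The capital of France is Paris.",
    }
]
results = eval_client.evaluate(
    data=sample_data,
    checks=[
        Evals.FACTUAL_ACCURACY,
        GuidelineAdherence(
            guideline="Answer in a single sentence.",
            guideline_name="single_sentence",
        ),
    ],
    project_name="demo-project",
)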
class CritiqueTone(ParametricEval): @@ -32,10 +32,12 @@ class CritiqueTone(ParametricEval): class GuidelineAdherence(ParametricEval): guideline: str - guideline_name: str = "guideline" # User-assigned name of the guideline to distinguish between multiple checks - response_schema: t.Union[ - str, None - ] = None # Schema of the response in case it is of type JSON, XML, etc. + guideline_name: str = ( + "guideline" # User-assigned name of the guideline to distinguish between multiple checks + ) + response_schema: t.Union[str, None] = ( + None # Schema of the response in case it is of type JSON, XML, etc. + ) class ConversationSatisfaction(ParametricEval): @@ -52,9 +54,9 @@ class CustomPromptEval(ParametricEval): list[float], list[int] ] # Scores associated for each choice. ex: [1.0, 0.0] eval_type: t.Literal["classify", "cot_classify"] = "cot_classify" - prompt_var_to_column_mapping: t.Union[ - dict[str, str], None - ] = None # Specify matching between variables in the evaluation prompt and keys in your data + prompt_var_to_column_mapping: t.Union[dict[str, str], None] = ( + None # Specify matching between variables in the evaluation prompt and keys in your data + ) class ResponseMatching(ParametricEval): @@ -62,4 +64,6 @@ class ResponseMatching(ParametricEval): class JailbreakDetection(ParametricEval): - model_purpose: str = "To help the users with their queries without providing them with any illegal, immoral or abusive content." + model_purpose: str = ( + "To help the users with their queries without providing them with any illegal, immoral or abusive content." + ) diff --git a/uptrain/framework/remote.py b/uptrain/framework/remote.py index 8642cb92b..0effd90ab 100644 --- a/uptrain/framework/remote.py +++ b/uptrain/framework/remote.py @@ -6,10 +6,10 @@ import typing as t from loguru import logger +from pydantic import BaseModel import httpx import polars as pl import pandas as pd -import pydantic from uptrain.framework.checks import CheckSet, ExperimentArgs from uptrain.framework.base import Settings @@ -26,7 +26,7 @@ from uptrain.utilities import polars_to_pandas -class DataSchema(pydantic.BaseModel): +class DataSchema(BaseModel): id_: str = "id" question: str = "question" response: str = "response" diff --git a/uptrain/operators/__init__.pyi b/uptrain/operators/__init__.pyi index c821a76f9..72a8a3133 100644 --- a/uptrain/operators/__init__.pyi +++ b/uptrain/operators/__init__.pyi @@ -161,9 +161,7 @@ from .language.response_quality import ( ResponseRelevance, ResponseMatchingScore, ) -from .language.question_quality import ( - ValidQuestionScore -) +from .language.question_quality import ValidQuestionScore from .language.language_quality import LanguageCritique, ResponseCoherence from .language.tone import ToneCritique from .language.guideline import GuidelineAdherenceScore diff --git a/uptrain/operators/base.py b/uptrain/operators/base.py index aaea2c51e..f1b8b6ac3 100644 --- a/uptrain/operators/base.py +++ b/uptrain/operators/base.py @@ -9,7 +9,7 @@ import typing_extensions as te from loguru import logger -from pydantic import BaseModel +from pydantic import ConfigDict, BaseModel import polars as pl if t.TYPE_CHECKING: @@ -98,10 +98,9 @@ class OpBaseModel(BaseModel): model, to get around some of the sharp edges. """ - class Config: - extra = "allow" - smart_union = True - underscore_attrs_are_private = True + # TODO[pydantic]: The following keys were removed: `smart_union`, `underscore_attrs_are_private`. 
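# The same pydantic-v2 migration recurs throughout this diff: the nested
# `class Config` is replaced with a `model_config` attribute, and on settings
# models `Field(..., env=...)` becomes `Field(..., validation_alias=...)`.
# A minimal, self-contained sketch of the two patterns side by side; the
# class and field names here are illustrative only, not taken from the
# codebase:

import typing as t

from pydantic import BaseModel, ConfigDict, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class ExampleOp(BaseModel):
    # pydantic v1 equivalent:  class Config: extra = "allow"
    model_config = ConfigDict(extra="allow")

    col_in_text: str = "text"


class ExampleSettings(BaseSettings):
    # pydantic v1 equivalent:  some_api_key: str = Field(None, env="SOME_API_KEY")
    some_api_key: t.Optional[str] = Field(None, validation_alias="SOME_API_KEY")

    model_config = SettingsConfigDict(extra="allow")


example_settings = ExampleSettings()  # reads SOME_API_KEY from the environment if set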
+ # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-config for more information. + model_config = ConfigDict(extra="allow") class ColumnOp(OpBaseModel): diff --git a/uptrain/operators/chart.py b/uptrain/operators/chart.py index a15047826..67dfd2194 100644 --- a/uptrain/operators/chart.py +++ b/uptrain/operators/chart.py @@ -10,13 +10,16 @@ from __future__ import annotations import typing as t -from loguru import logger from pydantic import Field import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + OpBaseModel, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep, polars_to_pandas px = lazy_load_dep("plotly.express", "plotly>=5.0.0") @@ -175,7 +178,7 @@ class BarChart(Chart): barmode: str = "group" - kind = "bar" + kind: str = "bar" @register_op @@ -224,7 +227,7 @@ class LineChart(Chart): description: str = "" color: str = "" - kind = "line" + kind: str = "line" @register_op @@ -274,7 +277,7 @@ class ScatterPlot(Chart): color: str = "" symbol: str = "circle" - kind = "scatter" + kind: str = "scatter" @register_op @@ -327,7 +330,7 @@ class Scatter3DPlot(Chart): color: str = "" symbol: str = "" - kind = "scatter_3d" + kind: str = "scatter_3d" def setup(self, settings: Settings = None): super(Scatter3DPlot, self).setup() @@ -391,7 +394,7 @@ class Histogram(Chart): color: str = "" nbins: int = 20 - kind = "histogram" + kind: str = "histogram" @register_op @@ -461,10 +464,10 @@ class MultiPlot(Chart): description: str = "" charts: list - kind = "multiplot" + kind: str = "multiplot" def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: - if type(self.charts[0]) == dict: + if isinstance(self.charts[0], dict): self.charts = [Chart(**chart).setup() for chart in self.charts] fig = ps.make_subplots( @@ -483,9 +486,6 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: annotation_multi_height = ( -0.3 ) # Adjust this value for multiline annotation position - annotation_line_height = ( - -0.05 - ) # Adjust this value for multiline annotation spacing for idx, chart in enumerate(self.charts): plot = getattr(px, chart.kind)(polars_to_pandas(data), **chart.props) diff --git a/uptrain/operators/clustering.py b/uptrain/operators/clustering.py index 7d4452740..0c40a01aa 100644 --- a/uptrain/operators/clustering.py +++ b/uptrain/operators/clustering.py @@ -14,11 +14,14 @@ from loguru import logger import numpy as np import polars as pl -from pydantic import root_validator if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep nltk = lazy_load_dep("nltk", "nltk") diff --git a/uptrain/operators/code/detection.py b/uptrain/operators/code/detection.py index 0310b8bb2..8add2827c 100644 --- a/uptrain/operators/code/detection.py +++ b/uptrain/operators/code/detection.py @@ -9,19 +9,27 @@ from loguru import logger import polars as pl from uptrain.operators.language.llm import LLMMulticlient -from uptrain.operators.language.prompts.classic import CODE_HALLUCINATION_PROMPT_TEMPLATE -from uptrain.operators.language.prompts.few_shots import CODE_HALLUCINATION_FEW_SHOT__CLASSIFY, CODE_HALLUCINATION_FEW_SHOT__COT +from uptrain.operators.language.prompts.classic import ( + CODE_HALLUCINATION_PROMPT_TEMPLATE, +) +from uptrain.operators.language.prompts.few_shots import ( + 
CODE_HALLUCINATION_FEW_SHOT__CLASSIFY, + CODE_HALLUCINATION_FEW_SHOT__COT, +) from uptrain.operators.language.prompts.instructions import CHAIN_OF_THOUGHT, CLASSIFY -from uptrain.operators.language.prompts.output_format import CODE_HALLUCINATION_OUTPUT_FORMAT__CLASSIFY, CODE_HALLUCINATION_OUTPUT_FORMAT__COT +from uptrain.operators.language.prompts.output_format import ( + CODE_HALLUCINATION_OUTPUT_FORMAT__CLASSIFY, + CODE_HALLUCINATION_OUTPUT_FORMAT__COT, +) from uptrain.utilities.prompt_utils import parse_scenario_description - -from uptrain.framework.base import ( +from uptrain.operators.base import ( ColumnOp, register_op, TYPE_TABLE_OUTPUT, - Settings, ) +from uptrain.framework.base import Settings + from uptrain.utilities import polars_to_json_serializable_dict @@ -56,7 +64,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -70,7 +81,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["context"] = row.pop(self.col_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate("code_hallucination", data_send) diff --git a/uptrain/operators/code/sql.py b/uptrain/operators/code/sql.py index dd47263be..57c596a9d 100644 --- a/uptrain/operators/code/sql.py +++ b/uptrain/operators/code/sql.py @@ -6,7 +6,6 @@ import itertools import json -import os import typing as t from pydantic import BaseModel @@ -21,8 +20,12 @@ ) if t.TYPE_CHECKING: - from uptrain.framework.base import * -from uptrain.operators.base import * + from uptrain.framework.base import Settings +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) sqlglot = lazy_load_dep("sqlglot", "sqlglot") diff --git a/uptrain/operators/drift.py b/uptrain/operators/drift.py index 25e9dbac9..d79e9a73d 100644 --- a/uptrain/operators/drift.py +++ b/uptrain/operators/drift.py @@ -20,7 +20,12 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + OpBaseModel, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep drift = lazy_load_dep("river.drift", "river") diff --git a/uptrain/operators/embedding/embedding.py b/uptrain/operators/embedding/embedding.py index 9d77a7714..7aaa9d742 100644 --- a/uptrain/operators/embedding/embedding.py +++ b/uptrain/operators/embedding/embedding.py @@ -18,7 +18,11 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep @@ -73,7 +77,9 @@ class Embedding(ColumnOp): """ - model: str = "" # t.Literal["MiniLM-L6-v2", "instructor-xl", "mpnet-base-v2", "bge-large-zh-v1.5", "instructor-large"] + model: str = ( + "" # t.Literal["MiniLM-L6-v2", 
"instructor-xl", "mpnet-base-v2", "bge-large-zh-v1.5", "instructor-large"] + ) col_in_text: str = "text" col_out: str = "embedding" batch_size: int = 128 @@ -183,7 +189,7 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: ).json()["data"] ] emb_length = len(run_res[0]) - except: + except Exception: run_res = [] for elem_idx in range(idx * BATCH_SIZE, (idx + 1) * BATCH_SIZE): if elem_idx < len(inputs): diff --git a/uptrain/operators/embedding/vector_search.py b/uptrain/operators/embedding/vector_search.py index f0c299263..819f295cc 100644 --- a/uptrain/operators/embedding/vector_search.py +++ b/uptrain/operators/embedding/vector_search.py @@ -7,12 +7,15 @@ import copy import numpy as np -from loguru import logger import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.operators.embedding.embedding import Embedding from uptrain.operators.io.base import JsonReader, CsvReader from uptrain.utilities import lazy_load_dep, polars_to_pandas diff --git a/uptrain/operators/embs.py b/uptrain/operators/embs.py index 52c9fc9e8..4d7f15406 100644 --- a/uptrain/operators/embs.py +++ b/uptrain/operators/embs.py @@ -15,11 +15,17 @@ from loguru import logger import numpy as np import polars as pl -from pydantic import root_validator +from pydantic import model_validator if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + TransformOp, + TYPE_TABLE_OUTPUT, + get_output_col_name_at, + register_op, +) from uptrain.utilities import lazy_load_dep umap = lazy_load_dep("umap", "umap-learn") @@ -92,7 +98,8 @@ class Distribution(TransformOp): col_in_groupby: list[str] col_out: list[str] | None = None - @root_validator(pre=True) + @model_validator(mode="before") + @classmethod def _check_cols(cls, values): """ Validator to check the validity of input and output column lists. 
diff --git a/uptrain/operators/io/base.py b/uptrain/operators/io/base.py index ce797496a..f7225c184 100644 --- a/uptrain/operators/io/base.py +++ b/uptrain/operators/io/base.py @@ -8,7 +8,12 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, + OpBaseModel, +) from uptrain.utilities import lazy_load_dep # ----------------------------------------------------------- @@ -153,8 +158,6 @@ class DeltaWriter(OpBaseModel): columns: t.Optional[list[str]] = None def setup(self, settings: Settings): - dl = lazy_load_dep("deltatable", "deltalake>=0.9") - return self def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: diff --git a/uptrain/operators/io/bq.py b/uptrain/operators/io/bq.py index a5b29bfea..dd42c7711 100644 --- a/uptrain/operators/io/bq.py +++ b/uptrain/operators/io/bq.py @@ -8,7 +8,11 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep diff --git a/uptrain/operators/io/duck.py b/uptrain/operators/io/duck.py index f88c1e8e9..741c4052e 100644 --- a/uptrain/operators/io/duck.py +++ b/uptrain/operators/io/duck.py @@ -7,7 +7,11 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep diff --git a/uptrain/operators/io/excel.py b/uptrain/operators/io/excel.py index 6473a70e0..19ec1caa0 100644 --- a/uptrain/operators/io/excel.py +++ b/uptrain/operators/io/excel.py @@ -4,11 +4,14 @@ import typing as t import polars as pl -import deltalake as dl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep xlsx2csv = lazy_load_dep("xlsx2csv", "xlsx2csv") diff --git a/uptrain/operators/io/mongodb.py b/uptrain/operators/io/mongodb.py index 77143b7e7..45c0cdef5 100644 --- a/uptrain/operators/io/mongodb.py +++ b/uptrain/operators/io/mongodb.py @@ -2,13 +2,16 @@ from __future__ import annotations import typing as t -import io import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep diff --git a/uptrain/operators/language/bleu.py b/uptrain/operators/language/bleu.py index b5c6fcec1..7d0faaa73 100644 --- a/uptrain/operators/language/bleu.py +++ b/uptrain/operators/language/bleu.py @@ -8,13 +8,16 @@ from __future__ import annotations import typing as t -from loguru import logger import polars as pl from uptrain.framework import Settings if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep # blue_score = lazy_load_dep("nltk.translate.bleu_score", "nltk") diff --git a/uptrain/operators/language/context_quality.py b/uptrain/operators/language/context_quality.py index ae372ccc4..197955ec9 100644 --- a/uptrain/operators/language/context_quality.py +++ 
b/uptrain/operators/language/context_quality.py @@ -78,7 +78,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -91,7 +94,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["context"] = row.pop(self.col_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -224,7 +230,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -238,7 +247,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["context"] = row.pop(self.col_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -338,9 +350,9 @@ def evaluate_local(self, data): json.loads(res.response.choices[0].message.content)["Choice"] ] output["score_response_completeness_wrt_context"] = float(score) - output[ - "explanation_response_completeness_wrt_context" - ] = res.response.choices[0].message.content + output["explanation_response_completeness_wrt_context"] = ( + res.response.choices[0].message.content + ) except Exception: logger.error( f"Error when processing payload at index {idx}: {res.error}" @@ -382,7 +394,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -396,7 +411,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["reranked_context"] = row.pop(self.col_reranked_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -530,7 +548,10 @@ def setup(self, settings: t.Optional[Settings] = None): 
assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -544,7 +565,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["concise_context"] = row.pop(self.col_concise_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/conversation.py b/uptrain/operators/language/conversation.py index eb1eb1037..ef0f29d89 100644 --- a/uptrain/operators/language/conversation.py +++ b/uptrain/operators/language/conversation.py @@ -67,7 +67,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -79,7 +82,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["conversation"] = row[self.col_conversation] try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/factual_accuracy.py b/uptrain/operators/language/factual_accuracy.py index 17c917470..0412e4047 100644 --- a/uptrain/operators/language/factual_accuracy.py +++ b/uptrain/operators/language/factual_accuracy.py @@ -70,7 +70,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -84,7 +87,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["context"] = row.pop(self.col_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/generation.py b/uptrain/operators/language/generation.py index 1bb17e40e..63dda82e9 100644 --- a/uptrain/operators/language/generation.py +++ 
b/uptrain/operators/language/generation.py @@ -15,7 +15,12 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, + ColumnOp, +) from uptrain.operators.language.llm import LLMMulticlient, Payload @@ -82,7 +87,7 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: # TODO: Temp Fix for handling json in prompts. Permanent fix is to integrate langchain? try: prompt = row["template"].format(**fill) - except: + except Exception: prompt = row["template"] for k, v in fill.items(): prompt = prompt.replace("{{" + k + "}}", v) diff --git a/uptrain/operators/language/grammar.py b/uptrain/operators/language/grammar.py index e85305bed..48208ac73 100644 --- a/uptrain/operators/language/grammar.py +++ b/uptrain/operators/language/grammar.py @@ -15,7 +15,11 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.operators.language.llm import LLMMulticlient, Payload __all__ = ["GrammarScore"] diff --git a/uptrain/operators/language/guideline.py b/uptrain/operators/language/guideline.py index ae171ef97..b89af7cea 100644 --- a/uptrain/operators/language/guideline.py +++ b/uptrain/operators/language/guideline.py @@ -68,7 +68,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -81,7 +84,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/jailbreak.py b/uptrain/operators/language/jailbreak.py index f95ee1f74..3cd208176 100644 --- a/uptrain/operators/language/jailbreak.py +++ b/uptrain/operators/language/jailbreak.py @@ -57,7 +57,9 @@ class JailbreakDetectionScore(ColumnOp): col_question: str = "question" col_out: str = "score_jailbreak_attempted" - model_purpose: str = "To help the users with their queries without providing them with any illegal, immoral or abusive content." + model_purpose: str = ( + "To help the users with their queries without providing them with any illegal, immoral or abusive content." 
+ ) scenario_description: t.Optional[str] = None score_mapping: dict = {"A": 1.0, "B": 0.0} @@ -66,7 +68,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -78,7 +83,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["question"] = row.pop(self.col_question) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -214,7 +222,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -226,7 +237,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["question"] = row.pop(self.col_question) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/language_quality.py b/uptrain/operators/language/language_quality.py index 1fbba2358..25f307fa3 100644 --- a/uptrain/operators/language/language_quality.py +++ b/uptrain/operators/language/language_quality.py @@ -72,7 +72,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -84,7 +87,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate("critique_language", data_send) @@ -212,7 +218,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not 
len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -224,7 +233,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate("critique_language", data_send) diff --git a/uptrain/operators/language/llm.py b/uptrain/operators/language/llm.py index 9c5f9e48b..4626b970b 100644 --- a/uptrain/operators/language/llm.py +++ b/uptrain/operators/language/llm.py @@ -8,13 +8,11 @@ import random import typing as t -from contextlib import suppress from loguru import logger from pydantic import BaseModel, Field if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * from uptrain.utilities import lazy_load_dep openai = lazy_load_dep("openai", "openai") @@ -25,6 +23,7 @@ from openai import AsyncOpenAI from openai import AsyncAzureOpenAI import openai +from aiolimiter import AsyncLimiter # import openai.error @@ -55,11 +54,11 @@ def run_validation(llm_output, validation_func): async def async_process_payload( payload: Payload, - rpm_limiter: aiolimiter.AsyncLimiter, - tpm_limiter: aiolimiter.AsyncLimiter, + rpm_limiter: AsyncLimiter, + tpm_limiter: AsyncLimiter, aclient: t.Union[AsyncOpenAI, AsyncAzureOpenAI, None], max_retries: int, - validate_func: function = None, + validate_func: t.Callable = None, ) -> Payload: messages = payload.data["messages"] total_chars = sum(len(msg["role"]) + len(msg["content"]) for msg in messages) @@ -90,9 +89,10 @@ async def async_process_payload( break except Exception as exc: logger.error(f"Error when sending request to LLM API: {exc}") - sleep_and_retry = (count < max_retries - 1) + sleep_and_retry = count < max_retries - 1 if aclient is not None: - if not ( isinstance( + if not ( + isinstance( exc, ( openai.APIConnectionError, @@ -106,11 +106,10 @@ async def async_process_payload( sleep_and_retry = False else: litellm = lazy_load_dep("litellm", "litellm") - if not ( isinstance( + if not ( + isinstance( exc, - ( - litellm.RateLimitError, - ), + (litellm.RateLimitError,), ) ): sleep_and_retry = False @@ -231,7 +230,7 @@ def make_payload( ) def fetch_responses( - self, input_payloads: list[Payload], validate_func: function = None + self, input_payloads: list[Payload], validate_func: t.Callable = None ) -> list[Payload]: try: return asyncio.run( @@ -251,17 +250,16 @@ def fetch_responses( input_payloads, validate_func=validate_func ), ).result() - except: + except Exception: logger.error(f"Caught an exception: {e}") - async def async_fetch_responses( self, input_payloads: list[Payload], - validate_func: function = None, + validate_func: t.Callable = None, ) -> list[Payload]: - rpm_limiter = aiolimiter.AsyncLimiter(self._rpm_limit, time_period=60) - tpm_limiter = aiolimiter.AsyncLimiter(self._tpm_limit, time_period=60) + rpm_limiter = AsyncLimiter(self._rpm_limit, time_period=60) + tpm_limiter = AsyncLimiter(self._tpm_limit, time_period=60) async_outputs = [ async_process_payload( data, diff --git 
a/uptrain/operators/language/meteor.py b/uptrain/operators/language/meteor.py index 3642b85e9..695c36a09 100644 --- a/uptrain/operators/language/meteor.py +++ b/uptrain/operators/language/meteor.py @@ -11,13 +11,16 @@ from __future__ import annotations import typing as t -from loguru import logger import polars as pl from uptrain.framework import Settings if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep nltk = lazy_load_dep("nltk", "nltk") diff --git a/uptrain/operators/language/model_grade.py b/uptrain/operators/language/model_grade.py index 020a44455..71e9fe403 100644 --- a/uptrain/operators/language/model_grade.py +++ b/uptrain/operators/language/model_grade.py @@ -4,7 +4,6 @@ from __future__ import annotations import typing as t -import os import copy import re @@ -15,7 +14,11 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.operators.language.llm import LLMMulticlient, Payload # from evals.elsuite.modelgraded.classify_utils import ( @@ -26,7 +29,6 @@ import logging import string -from typing import Any, Callable, Iterable, Optional, Union MATCH_FNS = { "include": lambda x, y: float(x in y), @@ -55,9 +57,9 @@ def get_choice_score( choice: str, - choice_strings: Iterable[str], - choice_scores: Optional[Union[dict[str, float], str]] = None, -) -> Optional[float]: + choice_strings: t.Iterable[str], + choice_scores: t.Optional[t.Union[dict[str, float], str]] = None, +) -> t.Optional[float]: if choice_scores is None: return None if choice_scores == "from_strings": @@ -68,7 +70,7 @@ def get_choice_score( return choice_scores[choice] -def choice_to_str(choice_strings: Iterable[str]) -> str: +def choice_to_str(choice_strings: t.Iterable[str]) -> str: """Return a string of choices, e.g. 
'"Yes" or "No" or "Maybe"'.""" return " or ".join(f'"{choice}"' for choice in choice_strings) @@ -77,8 +79,8 @@ def append_answer_prompt( prompt: list, eval_type: str, append_type: str = "as_content", - answer_prompt: Optional[list] = None, - choice_strings: Optional[Iterable[str]] = None, + answer_prompt: t.Optional[list] = None, + choice_strings: t.Optional[t.Iterable[str]] = None, ) -> list: """Append answer prompt to prompt.""" answer_prompt = ( @@ -160,9 +162,11 @@ def format_prompt( """Format a prompt with only necessary kwargs.""" # if any input kwargs is chat prompt, convert to text prompt kwargs = { - k: chat_prompt_to_text_prompt(v, for_completion=False) - if is_chat_prompt(v) - else v + k: ( + chat_prompt_to_text_prompt(v, for_completion=False) + if is_chat_prompt(v) + else v + ) for k, v in kwargs.items() } if is_chat_prompt(prompt): @@ -262,7 +266,7 @@ def setup(self, settings: Settings): self._api_client = LLMMulticlient(settings=settings) self._settings = settings self.model = settings.model.replace("azure/", "") - if not (self.eval_type in ["cot_classify", "tot_classify", "tot_score"]): + if self.eval_type not in ["cot_classify", "tot_classify", "tot_score"]: raise Exception( "Only eval_type: cot_classify and tot_classify is supported for model grading check" ) @@ -324,15 +328,15 @@ def get_choice_via_llm(self, text: str, grading_prompt_template: str) -> str: score = output_payload.response.choices[0].message.content float(score) return score - except: + except Exception: return str(0.0) def get_choice( self, text: str, eval_type: str, - match_fn: Union[str, Callable], - choice_strings: Iterable[str], + match_fn: t.Union[str, t.Callable], + choice_strings: t.Iterable[str], choice_scores: dict = {}, ) -> str: """Clean the answer string to a choice string to one of choice_strings. Return '__invalid__.' if no match.""" @@ -416,7 +420,7 @@ def get_choice( new_char = char + prev_char try: float(new_char) - except: + except Exception: break prev_char = new_char part_before_decimal = prev_char @@ -427,7 +431,7 @@ def get_choice( new_char = prev_char + char try: float(new_char) - except: + except Exception: break prev_char = new_char part_after_decimal = prev_char @@ -439,7 +443,7 @@ def get_choice( text, self.grading_prompt_template ) return str(choice) - except: + except Exception: return self.get_choice_via_llm( text, self.grading_prompt_template ) diff --git a/uptrain/operators/language/openai_evals.py b/uptrain/operators/language/openai_evals.py index c858537ec..bf6ff364c 100644 --- a/uptrain/operators/language/openai_evals.py +++ b/uptrain/operators/language/openai_evals.py @@ -4,18 +4,21 @@ import typing as t import uuid import itertools -import numpy as np import evals import evals.base import evals.record import evals.registry -from loguru import logger import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + TransformOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import to_py_types UPTRAIN_BASE_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/uptrain/operators/language/prompts/classic.py b/uptrain/operators/language/prompts/classic.py index 151ab4bc7..1a081d7df 100644 --- a/uptrain/operators/language/prompts/classic.py +++ b/uptrain/operators/language/prompts/classic.py @@ -448,8 +448,6 @@ """ - - # Code Hallucination CODE_HALLUCINATION_PROMPT_TEMPLATE = """ You are given a response generated from a chatbot. 
Please assess whether the given response includes any computer code (or CLI command) or not. If you do find a code/command, include the line number in which you found the code/command. @@ -473,4 +471,3 @@ Task data: [Response]: {response} """ - diff --git a/uptrain/operators/language/prompts/few_shots.py b/uptrain/operators/language/prompts/few_shots.py index 158c58b51..a56f34603 100644 --- a/uptrain/operators/language/prompts/few_shots.py +++ b/uptrain/operators/language/prompts/few_shots.py @@ -545,4 +545,4 @@ "Choice": "A", "Snippet": "SELECT * FROM hospitals WHERE name = \"St. Mary's Hospital\";" } -""" \ No newline at end of file +""" diff --git a/uptrain/operators/language/question_quality.py b/uptrain/operators/language/question_quality.py index 9da827306..fde4c6704 100644 --- a/uptrain/operators/language/question_quality.py +++ b/uptrain/operators/language/question_quality.py @@ -41,16 +41,20 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: try: for row in data_send: question = row.pop(self.col_question) - results.append({"score_valid_question": int(len(question.split(" ")) > self.words_threshold)}) + results.append( + { + "score_valid_question": int( + len(question.split(" ")) > self.words_threshold + ) + } + ) except Exception as e: logger.error(f"Failed to run evaluation for `ValidQuestionScore`: {e}") raise e - + assert results is not None return { "output": data.with_columns( - pl.from_dicts(results).rename( - {"score_valid_question": self.col_out} - ) + pl.from_dicts(results).rename({"score_valid_question": self.col_out}) ) - } \ No newline at end of file + } diff --git a/uptrain/operators/language/response_quality.py b/uptrain/operators/language/response_quality.py index 872a16328..4fb3b3b14 100644 --- a/uptrain/operators/language/response_quality.py +++ b/uptrain/operators/language/response_quality.py @@ -77,7 +77,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -90,7 +93,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -223,7 +229,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -236,7 +245,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not 
len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -366,7 +378,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -378,7 +393,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -507,7 +525,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -519,7 +540,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -646,7 +670,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -658,7 +685,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -775,7 +805,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + 
or not len(self.settings.uptrain_access_token) + ): # TODO: Add support for local evaluation for all methods if self.method != "llm": raise Exception( @@ -796,7 +829,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["ground_truth"] = row.pop(self.col_ground_truth) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( @@ -834,24 +870,29 @@ def evaluate_local(self, data): Our methodology is based on the model grade evaluation introduced by openai evals. """ - data_precision = copy.deepcopy(pl.DataFrame(data)).rename({ - self.col_response: "response", - self.col_ground_truth: "context" - }) - data_recall = copy.deepcopy(pl.DataFrame(data)).rename({ - self.col_ground_truth: "response", - self.col_response: "context" - }) - eval_data = pl.concat([data_precision, data_recall.select(data_precision.columns)]) - - output = ResponseFactualScore( - col_question=self.col_question, - col_response="response", - col_context="context", - scenario_description=self.scenario_description, - ).setup(settings=self.settings).run(eval_data)["output"].to_dicts() - output_precision = output[0:len(data)] - output_recall = output[len(data):] + data_precision = copy.deepcopy(pl.DataFrame(data)).rename( + {self.col_response: "response", self.col_ground_truth: "context"} + ) + data_recall = copy.deepcopy(pl.DataFrame(data)).rename( + {self.col_ground_truth: "response", self.col_response: "context"} + ) + eval_data = pl.concat( + [data_precision, data_recall.select(data_precision.columns)] + ) + + output = ( + ResponseFactualScore( + col_question=self.col_question, + col_response="response", + col_context="context", + scenario_description=self.scenario_description, + ) + .setup(settings=self.settings) + .run(eval_data)["output"] + .to_dicts() + ) + output_precision = output[0 : len(data)] + output_recall = output[len(data) :] results = [] for combined_row in zip(output_precision, output_recall): @@ -877,7 +918,7 @@ def evaluate_local(self, data): if precision != 0 and recall != 0: output["score_response_matching"] = 4 * ( - (precision * recall) / (precision*3 + recall) + (precision * recall) / (precision * 3 + recall) ) else: output["score_response_matching"] = 0 diff --git a/uptrain/operators/language/rouge.py b/uptrain/operators/language/rouge.py index d39a70322..1913d7ccd 100644 --- a/uptrain/operators/language/rouge.py +++ b/uptrain/operators/language/rouge.py @@ -8,13 +8,16 @@ from __future__ import annotations import typing as t -from loguru import logger import polars as pl from uptrain.framework import Settings if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain.utilities import lazy_load_dep rouge_scorer = lazy_load_dep("rouge_score.rouge_scorer", "rouge_score") diff --git a/uptrain/operators/language/subquery.py b/uptrain/operators/language/subquery.py index d593e6de6..6784439f5 100644 --- a/uptrain/operators/language/subquery.py +++ b/uptrain/operators/language/subquery.py @@ -59,7 +59,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and 
(self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -72,7 +75,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["sub_questions"] = row.pop(self.col_sub_questions) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/text.py b/uptrain/operators/language/text.py index 6a7415837..10993ec2b 100644 --- a/uptrain/operators/language/text.py +++ b/uptrain/operators/language/text.py @@ -13,13 +13,16 @@ import typing as t from urllib.parse import urlparse -from loguru import logger import polars as pl from uptrain.framework import Settings if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) # TODO: Add support for versions without a minor version number (e.g., "v1") or without a patch version number (e.g., "v1.2") diff --git a/uptrain/operators/language/tone.py b/uptrain/operators/language/tone.py index b440ecfb0..160fed924 100644 --- a/uptrain/operators/language/tone.py +++ b/uptrain/operators/language/tone.py @@ -66,7 +66,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -78,7 +81,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["response"] = row.pop(self.col_response) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.evaluate( diff --git a/uptrain/operators/language/topic.py b/uptrain/operators/language/topic.py index d59c43c67..c0d66d1e2 100644 --- a/uptrain/operators/language/topic.py +++ b/uptrain/operators/language/topic.py @@ -9,19 +9,20 @@ from __future__ import annotations import typing as t -from loguru import logger import numpy as np import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * -from uptrain.utilities import lazy_load_dep +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) @register_op class TopicAssignmentviaCluster(ColumnOp): - """ Operator for assigning topics based on cluster assignments. Note, you should run Clustering operator before using this. 
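For reference, the combined score in the `ResponseMatching.evaluate_local` hunk above, `4 * (precision * recall) / (precision * 3 + recall)`, is the F-beta formula (1 + b^2) * P * R / (b^2 * P + R) with b^2 = 3, i.e. an F_sqrt(3) score that weights recall roughly 1.7x more than precision. A minimal sketch of just that arithmetic, assuming `precision` and `recall` are the two factual-accuracy scores computed on the swapped response/ground-truth frames as in that hunk:

def response_matching_score(precision: float, recall: float) -> float:
    # F-beta with beta**2 == 3: the same expression as
    # `4 * ((precision * recall) / (precision * 3 + recall))` in the hunk above,
    # guarded against the zero case the surrounding code also handles.
    if precision == 0 or recall == 0:
        return 0.0
    return 4 * (precision * recall) / (precision * 3 + recall)

# Example: precision=1.0, recall=0.5 gives ~0.571, pulled toward recall,
# whereas a plain F1 (harmonic mean) would give ~0.667.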
diff --git a/uptrain/operators/metrics.py b/uptrain/operators/metrics.py index 129675966..27f7d3857 100644 --- a/uptrain/operators/metrics.py +++ b/uptrain/operators/metrics.py @@ -6,13 +6,16 @@ from __future__ import annotations import typing as t -from loguru import logger import numpy as np import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) @register_op diff --git a/uptrain/operators/rca/rag_with_citation.py b/uptrain/operators/rca/rag_with_citation.py index 550ada72a..14a8e01b5 100644 --- a/uptrain/operators/rca/rag_with_citation.py +++ b/uptrain/operators/rca/rag_with_citation.py @@ -12,18 +12,23 @@ if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) from uptrain import RcaTemplate from uptrain.utilities import polars_to_json_serializable_dict from uptrain.operators.language.llm import LLMMulticlient from uptrain.operators import ( - ValidQuestionScore, - ResponseFactualScore, - ContextRelevance, + ValidQuestionScore, + ResponseFactualScore, + ContextRelevance, ValidResponseScore, ) + @register_op class RagWithCitation(ColumnOp): """ @@ -51,7 +56,10 @@ def setup(self, settings: t.Optional[Settings] = None): assert settings is not None self.settings = settings - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): self._api_client = LLMMulticlient(settings) else: self._api_client = APIClient(settings) @@ -66,7 +74,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: row["cited_context"] = row.pop(self.col_cited_context) try: - if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)): + if self.settings.evaluate_locally and ( + self.settings.uptrain_access_token is None + or not len(self.settings.uptrain_access_token) + ): results = self.evaluate_local(data_send) else: results = self._api_client.perform_root_cause_analysis( @@ -84,39 +95,59 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: assert results is not None return {"output": data.with_columns(pl.from_dicts(results))} - + def evaluate_local(self, data): - question_valid_scores = ValidQuestionScore( - col_question="question" - ).setup(settings=self.settings).run(pl.DataFrame(data))["output"].to_dicts() - - response_valid_scores = ValidResponseScore( - col_response="response" - ).setup(settings=self.settings).run(pl.DataFrame(data))["output"].to_dicts() - - context_relevance_scores = ContextRelevance( - col_question="question", - col_context="context" - ).setup(settings=self.settings).run(pl.DataFrame(data))["output"].to_dicts() - - factual_accuracy_scores = ResponseFactualScore( - col_question="question", - col_context="context", - col_response="response" - ).setup(settings=self.settings).run(pl.DataFrame(data))["output"].to_dicts() - - data_cited = copy.deepcopy(pl.DataFrame(data)).drop("context").rename({"cited_context": "context"}) - - cited_context_relevance_scores = ContextRelevance( - col_question="question", - col_context="context" - ).setup(settings=self.settings).run(data_cited)["output"].to_dicts() - - cited_factual_accuracy_scores 
= ResponseFactualScore( - col_question="question", - col_context="context", - col_response="response" - ).setup(settings=self.settings).run(data_cited)["output"].to_dicts() + question_valid_scores = ( + ValidQuestionScore(col_question="question") + .setup(settings=self.settings) + .run(pl.DataFrame(data))["output"] + .to_dicts() + ) + + response_valid_scores = ( + ValidResponseScore(col_response="response") + .setup(settings=self.settings) + .run(pl.DataFrame(data))["output"] + .to_dicts() + ) + + context_relevance_scores = ( + ContextRelevance(col_question="question", col_context="context") + .setup(settings=self.settings) + .run(pl.DataFrame(data))["output"] + .to_dicts() + ) + + factual_accuracy_scores = ( + ResponseFactualScore( + col_question="question", col_context="context", col_response="response" + ) + .setup(settings=self.settings) + .run(pl.DataFrame(data))["output"] + .to_dicts() + ) + + data_cited = ( + copy.deepcopy(pl.DataFrame(data)) + .drop("context") + .rename({"cited_context": "context"}) + ) + + cited_context_relevance_scores = ( + ContextRelevance(col_question="question", col_context="context") + .setup(settings=self.settings) + .run(data_cited)["output"] + .to_dicts() + ) + + cited_factual_accuracy_scores = ( + ResponseFactualScore( + col_question="question", col_context="context", col_response="response" + ) + .setup(settings=self.settings) + .run(data_cited)["output"] + .to_dicts() + ) results = [] @@ -125,23 +156,32 @@ def evaluate_local(self, data): this_row_error = None this_row_suggestion = None - question_completeness = question_valid_scores[idx]['score_valid_question'] - valid_response = response_valid_scores[idx]['score_valid_response'] - context_relevance = context_relevance_scores[idx]['score_context_relevance'] - factual_accuracy = factual_accuracy_scores[idx]['score_factual_accuracy'] - cited_relevance = cited_context_relevance_scores[idx]['score_context_relevance'] - cited_factual = cited_factual_accuracy_scores[idx]['score_factual_accuracy'] + question_completeness = question_valid_scores[idx]["score_valid_question"] + valid_response = response_valid_scores[idx]["score_valid_response"] + context_relevance = context_relevance_scores[idx]["score_context_relevance"] + factual_accuracy = factual_accuracy_scores[idx]["score_factual_accuracy"] + cited_relevance = cited_context_relevance_scores[idx][ + "score_context_relevance" + ] + cited_factual = cited_factual_accuracy_scores[idx]["score_factual_accuracy"] this_row_explanations = [ None, - response_valid_scores[idx]['explanation_valid_response'], - context_relevance_scores[idx]['explanation_context_relevance'], - factual_accuracy_scores[idx]['explanation_factual_accuracy'], - cited_context_relevance_scores[idx]['explanation_context_relevance'], - cited_factual_accuracy_scores[idx]['explanation_factual_accuracy'], + response_valid_scores[idx]["explanation_valid_response"], + context_relevance_scores[idx]["explanation_context_relevance"], + factual_accuracy_scores[idx]["explanation_factual_accuracy"], + cited_context_relevance_scores[idx]["explanation_context_relevance"], + cited_factual_accuracy_scores[idx]["explanation_factual_accuracy"], ] - this_row_scores = [question_completeness, valid_response, context_relevance, factual_accuracy, cited_relevance, cited_factual] + this_row_scores = [ + question_completeness, + valid_response, + context_relevance, + factual_accuracy, + cited_relevance, + cited_factual, + ] if question_completeness == 0: this_row_scores = [0, 0, 0, 0, 0, 0] @@ -151,7 +191,7 @@ def 
evaluate_local(self, data): "Default explanation as the question is incomplete", "Default explanation as the question is incomplete", "Default explanation as the question is incomplete", - "Default explanation as the question is incomplete" + "Default explanation as the question is incomplete", ] this_row_error = "Incomplete Question" this_row_suggestion = "Ask the user to provide a valid question. In case of an ongoing conversation, rewrite the question by taking previous messages into account." @@ -159,8 +199,8 @@ def evaluate_local(self, data): this_row_scores = [1, 0, context_relevance, 0, 0, 0] this_row_explanations = [ None, - response_valid_scores[idx]['explanation_valid_response'], - context_relevance_scores[idx]['explanation_context_relevance'], + response_valid_scores[idx]["explanation_valid_response"], + context_relevance_scores[idx]["explanation_context_relevance"], "Default explanation as the response doesn't contain any relevant information", "Default explanation as the response doesn't contain any relevant information", "Default explanation as the response doesn't contain any relevant information", @@ -169,7 +209,9 @@ def evaluate_local(self, data): this_row_error = "Response With No Information - Poor Retrieval" this_row_suggestion = "Context Retrieval Pipeline needs improvement" else: - this_row_error = "Response With No Information - Poor Context Utilization" + this_row_error = ( + "Response With No Information - Poor Context Utilization" + ) this_row_suggestion = "Add intermediary steps so as the LLM can better understand context and generate a valid response" elif context_relevance == 0: this_row_error = "Poor Retrieval" @@ -185,25 +227,26 @@ def evaluate_local(self, data): this_row_suggestion = "Add intermediary steps so as the LLM can better understand context and generate a complete response" else: this_row_error = "Others" - this_row_suggestion = "Please reach out to the UpTrain team for further brainstorming" - - results.append({ - "error_mode": this_row_error, - "error_resolution_suggestion": this_row_suggestion, - 'score_question_completeness': this_row_scores[0], - 'score_valid_response': this_row_scores[1], - 'explanation_valid_response': this_row_explanations[1], - 'score_context_relevance': this_row_scores[2], - 'explanation_context_relevance': this_row_explanations[2], - 'score_factual_accuracy': this_row_scores[3], - 'explanation_factual_accuracy': this_row_explanations[3], - 'score_cited_context_relevance': this_row_scores[4], - 'explanation_cited_context_relevance': this_row_explanations[4], - 'score_factual_accuracy_wrt_cited': this_row_scores[5], - 'explanation_factual_accuracy_wrt_cited': this_row_explanations[5], - }) - - return results + this_row_suggestion = ( + "Please reach out to the UpTrain team for further brainstorming" + ) + results.append( + { + "error_mode": this_row_error, + "error_resolution_suggestion": this_row_suggestion, + "score_question_completeness": this_row_scores[0], + "score_valid_response": this_row_scores[1], + "explanation_valid_response": this_row_explanations[1], + "score_context_relevance": this_row_scores[2], + "explanation_context_relevance": this_row_explanations[2], + "score_factual_accuracy": this_row_scores[3], + "explanation_factual_accuracy": this_row_explanations[3], + "score_cited_context_relevance": this_row_scores[4], + "explanation_cited_context_relevance": this_row_explanations[4], + "score_factual_accuracy_wrt_cited": this_row_scores[5], + "explanation_factual_accuracy_wrt_cited": this_row_explanations[5], 
+ } + ) - \ No newline at end of file + return results diff --git a/uptrain/operators/similarity.py b/uptrain/operators/similarity.py index 752909682..2f451010a 100644 --- a/uptrain/operators/similarity.py +++ b/uptrain/operators/similarity.py @@ -9,12 +9,15 @@ import typing as t import numpy as np -from loguru import logger import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + register_op, + TYPE_TABLE_OUTPUT, +) @register_op diff --git a/uptrain/operators/table.py b/uptrain/operators/table.py index 4be99adef..50e8ec7a4 100644 --- a/uptrain/operators/table.py +++ b/uptrain/operators/table.py @@ -8,14 +8,17 @@ from __future__ import annotations import typing as t -import numpy as np -from loguru import logger from pydantic import Field import polars as pl if t.TYPE_CHECKING: from uptrain.framework import Settings -from uptrain.operators.base import * +from uptrain.operators.base import ( + ColumnOp, + OpBaseModel, + register_op, + TYPE_TABLE_OUTPUT, +) @register_op diff --git a/uptrain/utilities/__init__.py b/uptrain/utilities/__init__.py index 7616ac9bf..f5e5dbdcc 100644 --- a/uptrain/utilities/__init__.py +++ b/uptrain/utilities/__init__.py @@ -10,7 +10,7 @@ from lazy_loader import load as _lazy_load from loguru import logger -import pydantic +from pydantic import BaseModel import numpy as np # import numpy.typing as npt @@ -53,7 +53,7 @@ def to_py_types(obj: t.Any) -> t.Any: "op_name": getattr(obj, "_uptrain_op_name"), "params": obj.dict(include=set(obj.__fields__)), } - elif isinstance(obj, pydantic.BaseModel): + elif isinstance(obj, BaseModel): return obj.dict() # for numpy types @@ -151,11 +151,11 @@ def polars_to_json_serializable_dict(data: pl.DataFrame): try: json.dumps(data_dictn) - except: + except Exception: for key in list(data_dictn[0].keys()): try: json.dumps([x[key] for x in data_dictn]) - except: + except Exception: for row in data_dictn: del row[key] @@ -171,7 +171,7 @@ def polars_to_pandas(data: pl.DataFrame): try: pd_data = data.to_pandas() - except: + except Exception: # convert to python native types first and then to pandas logger.warning( "Error converting polars to pandas. Trying to convert to python native types first." 
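The `RagWithCitation.evaluate_local` reformatting above settles every sub-evaluation into the same chained call: construct the operator, `setup(settings=...)`, `run(df)`, take the `"output"` table, convert it to dicts. A minimal sketch of that pattern in isolation, assuming an OpenAI key is available; the toy question/context values are placeholders, while the operator, `Settings`, and column arguments are taken from the hunks above:

import polars as pl
from uptrain.framework import Settings
from uptrain.operators import ContextRelevance

# Placeholder key; supply a real one or read it from the environment.
settings = Settings(openai_api_key="sk-...")
data = pl.DataFrame(
    {
        "question": ["What is UpTrain?"],
        "context": ["UpTrain is an open-source tool to evaluate LLM applications."],
    }
)

# Same construct -> setup -> run -> ["output"] -> to_dicts() chain used for each
# sub-score in evaluate_local; each dict carries score_* and explanation_* keys.
scores = (
    ContextRelevance(col_question="question", col_context="context")
    .setup(settings=settings)
    .run(data)["output"]
    .to_dicts()
)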
@@ -279,7 +279,7 @@ def lazy_load_dep(import_name: str, package_name: str): """ try: spec = importlib.util.find_spec(import_name) - except: + except Exception: spec = None if spec is None: logger.warning( diff --git a/uptrain/utilities/app_schema.py b/uptrain/utilities/app_schema.py index c4bfbb260..bc9a2bda3 100644 --- a/uptrain/utilities/app_schema.py +++ b/uptrain/utilities/app_schema.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datetime as dt -from enum import Enum import typing as t from pydantic import BaseModel @@ -34,6 +32,7 @@ class EvaluateV2(BaseModel): schema_dict: dict project: str + class EvaluateV3(BaseModel): model: str project_name: str @@ -50,4 +49,3 @@ class ProjectsList(BaseModel): class ProjectData(BaseModel): data: list[t.Any] project_name: str - \ No newline at end of file diff --git a/uptrain/utilities/db.py b/uptrain/utilities/db.py index c07d57c4b..5aa8cb04b 100644 --- a/uptrain/utilities/db.py +++ b/uptrain/utilities/db.py @@ -38,6 +38,7 @@ class ModelDataset(SQLBase): UniqueConstraint("user_id", "name", "version", name="uix_dataset"), ) + class ModelPrompt(SQLBase): __tablename__ = "prompts" @@ -52,6 +53,7 @@ class ModelPrompt(SQLBase): UniqueConstraint("user_id", "name", "version", name="uix_prompt"), ) + class ModelUser(SQLBase): __tablename__ = "users" diff --git a/uptrain/utilities/utils.py b/uptrain/utilities/utils.py index 15d4cc87b..de65b5dfe 100644 --- a/uptrain/utilities/utils.py +++ b/uptrain/utilities/utils.py @@ -11,47 +11,51 @@ from uptrain import Settings from uptrain import ( - Evals, - ResponseMatching, - GuidelineAdherence, - ConversationSatisfaction, + Evals, + ResponseMatching, + GuidelineAdherence, + ConversationSatisfaction, JailbreakDetection, - CritiqueTone + CritiqueTone, ) + def _get_fsspec_filesystem(database_path) -> fsspec.AbstractFileSystem: return DirFileSystem(database_path, auto_mkdir=True) + from uptrain.utilities import lazy_load_dep fsspec.config.conf["file"] = {"auto_mkdir": True} evals_mapping = { - "context_relevance" : Evals.CONTEXT_RELEVANCE, - "factual_accuracy" : Evals.FACTUAL_ACCURACY, - "response_relevance" : Evals.RESPONSE_RELEVANCE, - "critique_language" : Evals.CRITIQUE_LANGUAGE, - "response_completeness" : Evals.RESPONSE_COMPLETENESS, - "response_completeness_wrt_context" : Evals.RESPONSE_COMPLETENESS_WRT_CONTEXT, - "response_consistency" : Evals.RESPONSE_CONSISTENCY, - "response_conciseness" : Evals.RESPONSE_CONCISENESS, - "valid_response" : Evals.VALID_RESPONSE, + "context_relevance": Evals.CONTEXT_RELEVANCE, + "factual_accuracy": Evals.FACTUAL_ACCURACY, + "response_relevance": Evals.RESPONSE_RELEVANCE, + "critique_language": Evals.CRITIQUE_LANGUAGE, + "response_completeness": Evals.RESPONSE_COMPLETENESS, + "response_completeness_wrt_context": Evals.RESPONSE_COMPLETENESS_WRT_CONTEXT, + "response_consistency": Evals.RESPONSE_CONSISTENCY, + "response_conciseness": Evals.RESPONSE_CONCISENESS, + "valid_response": Evals.VALID_RESPONSE, "response_alignment_with_scenario": Evals.RESPONSE_ALIGNMENT_WITH_SCENARIO, - "response_sincerity_with_scenario" : Evals.RESPONSE_SINCERITY_WITH_SCENARIO, - "prompt_injection" : Evals.PROMPT_INJECTION, - "code_hallucination" : Evals.CODE_HALLUCINATION, - "sub_query_completeness" : Evals.SUB_QUERY_COMPLETENESS, - "context_reranking" : Evals.CONTEXT_RERANKING, - "context_conciseness ": Evals.CONTEXT_CONCISENESS + "response_sincerity_with_scenario": Evals.RESPONSE_SINCERITY_WITH_SCENARIO, + "prompt_injection": Evals.PROMPT_INJECTION, + "code_hallucination": 
Evals.CODE_HALLUCINATION, + "sub_query_completeness": Evals.SUB_QUERY_COMPLETENESS, + "context_reranking": Evals.CONTEXT_RERANKING, + "context_conciseness ": Evals.CONTEXT_CONCISENESS, } parametric_evals_mapping = { - "CritiqueTone" : CritiqueTone, - "GuidelineAdherence" : GuidelineAdherence, - "ConversationSatisfaction" : ConversationSatisfaction, - "ResponseMatching" : ResponseMatching, - "JailbreakDetection" : JailbreakDetection + "CritiqueTone": CritiqueTone, + "GuidelineAdherence": GuidelineAdherence, + "ConversationSatisfaction": ConversationSatisfaction, + "ResponseMatching": ResponseMatching, + "JailbreakDetection": JailbreakDetection, } + + def checks_mapping(check_name: str, params: dict = dict()): if check_name in evals_mapping: return evals_mapping[check_name] @@ -59,12 +63,14 @@ def checks_mapping(check_name: str, params: dict = dict()): return parametric_evals_mapping[check_name](**params) else: return None - + + def get_uuid(): import uuid return str(uuid.uuid4().hex) + def get_current_datetime(): return datetime.utcnow() @@ -74,6 +80,7 @@ def hash_string(s: str): return hashlib.sha256(s.encode()).hexdigest() + def create_dirs(path: str): dirs_to_create = [ os.path.join(path), @@ -84,6 +91,7 @@ def create_dirs(path: str): os.makedirs(_dir, exist_ok=True) return + def get_sqlite_utils_db(fpath: str): sqlite = lazy_load_dep("sqlite_utils", "sqlite_utils") import sqlite3 @@ -91,43 +99,46 @@ def get_sqlite_utils_db(fpath: str): conn = sqlite3.connect(fpath, check_same_thread=False) return sqlite.Database(conn) + def parse_prompt(prompt): prompt_vars = [] if prompt is not None and len(prompt): - if '{{' in prompt: + if "{{" in prompt: prompt_vars = [x.split("}}")[0] for x in prompt.split("{{")[1:]] for var in prompt_vars: prompt = prompt.replace("{{" + var + "}}", "{" + var + "}") - elif '{' in prompt: + elif "{" in prompt: prompt_vars = [x.split("}")[0] for x in prompt.split("{")[1:]] else: - prompt = '' + prompt = "" return prompt, prompt_vars + def convert_project_to_polars(project_data): dictn = [] for row in project_data: - data = row['data'] - data.update(row['checks']) - if 'uptrain_settings' in row['metadata']: - del row['metadata']['uptrain_settings'] - data.update(row['metadata']) - data.update({'project_name': row['project'], 'timestamp': row['timestamp']}) + data = row["data"] + data.update(row["checks"]) + if "uptrain_settings" in row["metadata"]: + del row["metadata"]["uptrain_settings"] + data.update(row["metadata"]) + data.update({"project_name": row["project"], "timestamp": row["timestamp"]}) dictn.append(data) return pl.DataFrame(dictn) + def convert_project_to_dicts(project_data): dictn = [] checks_mapping = {} for row in project_data: - data = row['data'] - #data.update(row['checks']) + data = row["data"] + # data.update(row['checks']) uuid_tag = get_uuid() - data.update({'uuid_tag': uuid_tag}) - checks_mapping[uuid_tag] = row['checks'] - if 'uptrain_settings' in row['metadata']: - del row['metadata']['uptrain_settings'] - data.update(row['metadata']) - data.update({'project_name': row['project'], 'timestamp': row['timestamp']}) + data.update({"uuid_tag": uuid_tag}) + checks_mapping[uuid_tag] = row["checks"] + if "uptrain_settings" in row["metadata"]: + del row["metadata"]["uptrain_settings"] + data.update(row["metadata"]) + data.update({"project_name": row["project"], "timestamp": row["timestamp"]}) dictn.append(data) return pl.DataFrame(dictn), checks_mapping diff --git a/uptrain/v0/core/classes/helpers/config_handler.py 
b/uptrain/v0/core/classes/helpers/config_handler.py index 27733e6c0..f4e95dd31 100644 --- a/uptrain/v0/core/classes/helpers/config_handler.py +++ b/uptrain/v0/core/classes/helpers/config_handler.py @@ -2,7 +2,7 @@ import typing from datetime import datetime -from pydantic import BaseModel +from pydantic import ConfigDict, BaseModel import numpy as np from uptrain.v0.constants import AnnotationMethod @@ -82,6 +82,4 @@ class Config(BaseModel): class GroundTruthArgs(BaseModel): gt: typing.Union[np.ndarray, list] id: typing.Union[np.ndarray, list] - - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/uptrain/v0/core/classes/monitors/data_drift.py b/uptrain/v0/core/classes/monitors/data_drift.py index ac55c1214..e713f0fef 100644 --- a/uptrain/v0/core/classes/monitors/data_drift.py +++ b/uptrain/v0/core/classes/monitors/data_drift.py @@ -118,9 +118,9 @@ def check(self, inputs, outputs, gts=None, extra_args={}): ] = uniq_indexs query_indexs[uniq_clusters] = uniq_indexs query_indexs = np.array(query_indexs, dtype=int) - self.bucket_labelling_info[ - "hover_vals_for_production_clusters" - ] = hover_measurable_vals[query_indexs] + self.bucket_labelling_info["hover_vals_for_production_clusters"] = ( + hover_measurable_vals[query_indexs] + ) self.prod_dist_counts_arr.append(self.prod_dist_counts.copy()) @@ -283,9 +283,9 @@ def base_is_data_interesting(self, inputs, outputs, gts=None, extra_args={}): is_interesting = np.logical_or(is_close, is_interesting) for lkdx in range(len(is_close)): if is_close[lkdx]: - reasons[ - lkdx - ] = "Lies_to_Low_Density_Regions_In_Training_Distribution" + reasons[lkdx] = ( + "Lies_to_Low_Density_Regions_In_Training_Distribution" + ) if len(self.outliers): dists_from_outliers = np.min( @@ -403,9 +403,9 @@ def bucket_reference_dataset(self): clustering_results["idxs_closest_to_cluster_centroids"].values() ) ] - self.bucket_labelling_info[ - "hover_vals_for_reference_clusters" - ] = all_hover_vals[hover_label_idxs] + self.bucket_labelling_info["hover_vals_for_reference_clusters"] = ( + all_hover_vals[hover_label_idxs] + ) self.prod_dist = np.zeros(self.ref_dist.shape) self.prod_dist_counts = np.zeros(self.ref_dist_counts.shape) diff --git a/uptrain/v0/core/classes/monitors/feature_drift.py b/uptrain/v0/core/classes/monitors/feature_drift.py index 509ed0b31..e9f2b80b0 100644 --- a/uptrain/v0/core/classes/monitors/feature_drift.py +++ b/uptrain/v0/core/classes/monitors/feature_drift.py @@ -77,7 +77,9 @@ def check(self, inputs, outputs, gts=None, extra_args={}): if psi > self.psi_threshold: alert = f"Feature Drift last detected at {self.all_count} for {feat_name} with PSI = {psi}" self.log_handler.add_alert( - f"Feature Drift Alert for {feat_name} 🚨", alert, self.dashboard_name + f"Feature Drift Alert for {feat_name} 🚨", + alert, + self.dashboard_name, ) self.feats = np.array([])
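The `config_handler.py` hunk above is the standard pydantic v1-to-v2 migration: the nested `class Config` carrying `arbitrary_types_allowed = True` becomes a `model_config = ConfigDict(arbitrary_types_allowed=True)` class attribute. A minimal sketch of the same shape, assuming pydantic v2 is installed; the class name here is illustrative, while the fields mirror `GroundTruthArgs`:

import typing

import numpy as np
from pydantic import BaseModel, ConfigDict


class GroundTruthArgsSketch(BaseModel):
    # Pydantic v2 style: ConfigDict replaces the inner `class Config`.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    gt: typing.Union[np.ndarray, list]
    id: typing.Union[np.ndarray, list]


# arbitrary_types_allowed lets pydantic accept the numpy array via an isinstance check.
args = GroundTruthArgsSketch(gt=np.array([1, 2, 3]), id=[0, 1, 2])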