MannLabs · JuliaS92 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/alphastats/gui/pages/06_LLM.py b/alphastats/gui/pages/06_LLM.py
@@ -26,6 +26,7 @@
 )
 from alphastats.llm.llm_integration import LLMIntegration, MessageKeys, Models, Roles
 from alphastats.llm.prompts import get_initial_prompt, get_system_message
+from alphastats.llm.uniprot_utils import format_uniprot_annotation
 from alphastats.plots.plot_utils import PlotlyObject
 
 OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
@@ -205,6 +206,7 @@ def llm_config():
 display_uniprot(
     regulated_genes_dict,
     st.session_state[StateKeys.DATASET]._feature_to_repr_map,
+    model_name=model_name,
     disabled=llm_integration_set_for_model,
 )
 
@@ -220,6 +222,18 @@ def llm_config():
 # TODO: Regenerate initial prompt on reset
 with st.expander("Initial prompt", expanded=True):
     feature_to_repr_map = st.session_state[StateKeys.DATASET]._feature_to_repr_map
+    if st.session_state[StateKeys.INTEGRATE_UNIPROT]:
+        texts = [
+            format_uniprot_annotation(
+                st.session_state[StateKeys.ANNOTATION_STORE][feature],
+                fields=st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
+            )
+            for feature in regulated_genes_dict
+        ]
+        uniprot_info = f"{os.linesep}{os.linesep}".join(texts)
+    else:
+        uniprot_info = ""
+
     initial_prompt = st.text_area(
         "",
         value=get_initial_prompt(
@@ -236,6 +250,7 @@ def llm_config():
                     st.session_state[StateKeys.SELECTED_GENES_DOWN],
                 )
             ),
+            uniprot_info,
         ),
         height=200,
         disabled=llm_integration_set_for_model,

diff --git a/alphastats/gui/utils/llm_helper.py b/alphastats/gui/utils/llm_helper.py
@@ -5,7 +5,7 @@
 import streamlit as st
 
 from alphastats.gui.utils.ui_helper import DefaultStates, StateKeys
-from alphastats.llm.llm_integration import LLMIntegration
+from alphastats.llm.llm_integration import LLMIntegration, MessageKeys, Models
 from alphastats.llm.uniprot_utils import (
     ExtractedUniprotFields,
     format_uniprot_annotation,
@@ -24,6 +24,9 @@ def protein_selector(df: pd.DataFrame, title: str, state_key: str) -> List[str]:
         selected_proteins (List[str]): A list of selected proteins.
     """
     st.write(title)
+    if len(df) == 0:
+        st.markdown("No significant proteins.")
+        return []
     c1, c2 = st.columns([1, 1])
     if c1.button("Select all", help=f"Select all {title} for analysis"):
         st.session_state[state_key] = df["Protein"].tolist()
@@ -183,10 +186,19 @@ def get_display_available_uniprot_info(regulated_features: list) -> dict:
 
 # TODO: Write test for this display
 @st.fragment
-def display_uniprot(regulated_genes_dict, feature_to_repr_map, disabled=False):
+def display_uniprot(
+    regulated_genes_dict,
+    feature_to_repr_map,
+    model_name: str = Models.OLLAMA_31_70B,
+    *,
+    disabled=False,
+):
     """Display the interface for selecting fields from UniProt information, including a preview of the selected fields."""
     all_fields = ExtractedUniprotFields.get_values()
-    c1, c2, c3, c4 = st.columns((1, 1, 3, 1))
+    st.markdown(
+        "We recommend providing at least limited information from Uniprot for all proteins as part of the initial prompt to avoid misinterpretation of gene names or ids by the LLM. You can edit the selection of fields to include while chatting for on-the-fly demand for more information."
+    )
+    c1, c2, c3, c4, c5, c6 = st.columns((1, 1, 1, 1, 1, 1))
     if c1.button("Select all"):
         st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS] = all_fields
         st.rerun(scope="fragment")
@@ -198,21 +210,35 @@ def display_uniprot(regulated_genes_dict, feature_to_repr_map, disabled=False):
             DefaultStates.SELECTED_UNIPROT_FIELDS.copy()
         )
         st.rerun(scope="fragment")
-    if c4.button(
-        "Integrate into initial prompt",
-        type="primary",
-        help="Not implemented yet, but will adjust the initial prompt to include the output from Uniprot already and the system message to avoid calling the tool function again for the genes included.",
-    ):
-        st.toast("Not implemented yet.", icon="⚠️")
-        # TODO: Implement this
+    with c4:
+        texts = [
+            format_uniprot_annotation(
+                st.session_state[StateKeys.ANNOTATION_STORE].get(feature, {}),
+                fields=st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
+            )
+            for feature in regulated_genes_dict
+        ]
+        dummy_model = LLMIntegration(model_name, api_key="lorem", load_tools=False)
+        tokens = dummy_model.estimate_tokens(
+            [{MessageKeys.CONTENT: text} for text in texts]
+        )
+        st.markdown(f"Total tokens: {tokens:.0f}")
+    with c5:
+        st.checkbox(
+            "Integrate into initial prompt",
+            help="If this is ticked and the initial prompt is updated, the Uniprot information will be included in the prompt and the instructions regarding uniprot will change to only look up more information if explicitly asked to do so. Make sure that the total tokens are below the message limit of your LLM.",
+            key=StateKeys.INTEGRATE_UNIPROT,
+            disabled=disabled,
+        )
+    if c6.button("Update prompt", disabled=disabled):
+        st.rerun(scope="app")
     c1, c2 = st.columns((1, 3))
     with c1, st.expander("Show options", expanded=True):
         selected_fields = []
         for field in all_fields:
             if st.checkbox(
                 field,
                 value=field in st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
-                disabled=disabled,
             ):
                 selected_fields.append(field)
         if set(selected_fields) != set(

diff --git a/alphastats/gui/utils/ui_helper.py b/alphastats/gui/utils/ui_helper.py
@@ -145,6 +145,9 @@ def init_session_state() -> None:
     if StateKeys.MAX_TOKENS not in st.session_state:
         st.session_state[StateKeys.MAX_TOKENS] = 10000
 
+    if StateKeys.INTEGRATE_UNIPROT not in st.session_state:
+        st.session_state[StateKeys.INTEGRATE_UNIPROT] = False
+
 
 class StateKeys(metaclass=ConstantsClass):
     USER_SESSION_ID = "user_session_id"
@@ -164,6 +167,7 @@ class StateKeys(metaclass=ConstantsClass):
     SELECTED_GENES_DOWN = "selected_genes_down"
     SELECTED_UNIPROT_FIELDS = "selected_uniprot_fields"
     MAX_TOKENS = "max_tokens"
+    INTEGRATE_UNIPROT = "integrate_uniprot"
 
     ORGANISM = "organism"  # TODO this is essentially a constant
 

diff --git a/alphastats/llm/prompts.py b/alphastats/llm/prompts.py
@@ -15,8 +15,7 @@ def get_system_message(dataset: DataSet) -> str:
 
     return (
         f"You are an expert biologist and have extensive experience in molecular biology, medicine and biochemistry.{os.linesep}"
-        "A user will present you with data regarding proteins upregulated in certain cells "
-        "sourced from UniProt and abstracts from scientific publications. They seek your "
+        "A user will present you with data regarding proteins upregulated. They seek your "
         "expertise in understanding the connections between these proteins and their potential role "
         f"in disease genesis. {os.linesep}"
         f"Provide a detailed and insightful, yet concise response based on the given information. Use formatting to make your response more human readable."
@@ -30,16 +29,29 @@ def get_initial_prompt(
     parameter_dict: Dict[str, Any],
     upregulated_genes: List[str],
     downregulated_genes: List[str],
+    uniprot_info: str,
 ):
     """Get the initial prompt for the LLM model."""
     group1 = parameter_dict["group1"]
     group2 = parameter_dict["group2"]
     column = parameter_dict["column"]
+    if uniprot_info:
+        uniprot_instructions = (
+            f"We have already retrieved relevant information from Uniprot for these proteins:{os.linesep}{os.linesep}{uniprot_info}{os.linesep}{os.linesep}"
+            "This contains curated information you may not have encountered before, value it highly. "
+            "Only retrieve additional information from Uniprot if explicitly asked to do so."
+        )
+    else:
+        uniprot_instructions = (
+            "You have the ability to retrieve curated information from Uniprot about these proteins. "
+            "Please do so for individual proteins if you have little information about a protein or find a protein particularly important in the specific context."
+        )
     return (
         f"We've recently identified several proteins that appear to be differently regulated in cells "
         f"when comparing {group1} and {group2} in the {column} group. "
         f"From our proteomics experiments, we know that the following ones are upregulated: {', '.join(upregulated_genes)}.{os.linesep}{os.linesep}"
         f"Here is the list of proteins that are downregulated: {', '.join(downregulated_genes)}.{os.linesep}{os.linesep}"
+        f"{uniprot_instructions}{os.linesep}{os.linesep}"
         f"Help us understand the potential connections between these proteins and how they might be contributing "
         f"to the differences. After that provide a high level summary"
     )