Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate uniprot info into initial prompt #396

Open
wants to merge 6 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions alphastats/gui/pages/06_LLM.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)
from alphastats.llm.llm_integration import LLMIntegration, MessageKeys, Models, Roles
from alphastats.llm.prompts import get_initial_prompt, get_system_message
from alphastats.llm.uniprot_utils import format_uniprot_annotation
from alphastats.plots.plot_utils import PlotlyObject

OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
Expand Down Expand Up @@ -205,6 +206,7 @@ def llm_config():
display_uniprot(
regulated_genes_dict,
st.session_state[StateKeys.DATASET]._feature_to_repr_map,
model_name=model_name,
disabled=llm_integration_set_for_model,
)

Expand All @@ -220,6 +222,18 @@ def llm_config():
# TODO: Regenerate initial prompt on reset
with st.expander("Initial prompt", expanded=True):
feature_to_repr_map = st.session_state[StateKeys.DATASET]._feature_to_repr_map
if st.session_state[StateKeys.INTEGRATE_UNIPROT]:
texts = [
format_uniprot_annotation(
st.session_state[StateKeys.ANNOTATION_STORE][feature],
fields=st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
)
for feature in regulated_genes_dict
]
uniprot_info = f"{os.linesep}{os.linesep}".join(texts)
else:
uniprot_info = ""

initial_prompt = st.text_area(
"",
value=get_initial_prompt(
Expand All @@ -236,6 +250,7 @@ def llm_config():
st.session_state[StateKeys.SELECTED_GENES_DOWN],
)
),
uniprot_info,
),
height=200,
disabled=llm_integration_set_for_model,
Expand Down
48 changes: 37 additions & 11 deletions alphastats/gui/utils/llm_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import streamlit as st

from alphastats.gui.utils.ui_helper import DefaultStates, StateKeys
from alphastats.llm.llm_integration import LLMIntegration
from alphastats.llm.llm_integration import LLMIntegration, MessageKeys, Models
from alphastats.llm.uniprot_utils import (
ExtractedUniprotFields,
format_uniprot_annotation,
Expand All @@ -24,6 +24,9 @@ def protein_selector(df: pd.DataFrame, title: str, state_key: str) -> List[str]:
selected_proteins (List[str]): A list of selected proteins.
"""
st.write(title)
if len(df) == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we move this check to 06_LLM.py, e.g. right after st.markdown("##### Genes of interest")? It feels a bit hidden here.

st.markdown("No significant proteins.")
return []
c1, c2 = st.columns([1, 1])
if c1.button("Select all", help=f"Select all {title} for analysis"):
st.session_state[state_key] = df["Protein"].tolist()
Expand Down Expand Up @@ -183,10 +186,19 @@ def get_display_available_uniprot_info(regulated_features: list) -> dict:

# TODO: Write test for this display
@st.fragment
def display_uniprot(regulated_genes_dict, feature_to_repr_map, disabled=False):
def display_uniprot(
regulated_genes_dict,
feature_to_repr_map,
model_name: str = Models.OLLAMA_31_70B,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please don't specify a default here

*,
disabled=False,
):
"""Display the interface for selecting fields from UniProt information, including a preview of the selected fields."""
all_fields = ExtractedUniprotFields.get_values()
c1, c2, c3, c4 = st.columns((1, 1, 3, 1))
st.markdown(
"We reccomend to provide at least limited information from Uniprot for all proteins as part of the initial prompt to avoid misinterpretaiton of gene names or ids by the LLM. You can edit the selection of fields to include while chatting for on the fly demand for more information."
)
c1, c2, c3, c4, c5, c6 = st.columns((1, 1, 1, 1, 1, 1))
if c1.button("Select all"):
st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS] = all_fields
st.rerun(scope="fragment")
Expand All @@ -198,21 +210,35 @@ def display_uniprot(regulated_genes_dict, feature_to_repr_map, disabled=False):
DefaultStates.SELECTED_UNIPROT_FIELDS.copy()
)
st.rerun(scope="fragment")
if c4.button(
"Integrate into initial prompt",
type="primary",
help="Not implemented yet, but will adjust the initial prompt to include the output from Uniprot already and the system message to avoid calling the tool function again for the genes included.",
):
st.toast("Not implemented yet.", icon="⚠️")
# TODO: Implement this
with c4:
texts = [
format_uniprot_annotation(
st.session_state[StateKeys.ANNOTATION_STORE].get(feature, {}),
fields=st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
)
for feature in regulated_genes_dict
]
dummy_model = LLMIntegration(model_name, api_key="lorem", load_tools=False)
tokens = dummy_model.estimate_tokens(
Comment on lines +221 to +222
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a bit hacky ;-)
please refactor such that the estimate_tokens becomes static and you can just call LLMIntegration.estimate_tokens here:

    @staticmethod
    def estimate_tokens(
        messages: List[Dict[str, str]], model: Optional[str] = None, average_chars_per_token: float = 3.6
    ) -> float:
...

[{MessageKeys.CONTENT: text} for text in texts]
)
st.markdown(f"Total tokens: {tokens:.0f}")
with c5:
st.checkbox(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need the extra checkbox? Can I not just click "update prompt"? (set state[StateKeys.INTEGRATE_UNIPROT]=True after button click)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need a way to also revert to an initial prompt that does not contain the uniprot information. Uncheck the box, then update.

"Integrate into initial prompt",
help="If this is ticked and the initial prompt is updated, the Uniprot information will be included in the prompt and the instructions regarding uniprot will change to onl;y look up more information if explicitly asked to do so. Make sure that the total tokens are below the message limit of your LLM.",
key=StateKeys.INTEGRATE_UNIPROT,
disabled=disabled,
)
if c6.button("Update prompt", disabled=disabled):
st.rerun(scope="app")
c1, c2 = st.columns((1, 3))
with c1, st.expander("Show options", expanded=True):
selected_fields = []
for field in all_fields:
if st.checkbox(
field,
value=field in st.session_state[StateKeys.SELECTED_UNIPROT_FIELDS],
disabled=disabled,
):
selected_fields.append(field)
if set(selected_fields) != set(
Expand Down
4 changes: 4 additions & 0 deletions alphastats/gui/utils/ui_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def init_session_state() -> None:
if StateKeys.MAX_TOKENS not in st.session_state:
st.session_state[StateKeys.MAX_TOKENS] = 10000

if StateKeys.INTEGRATE_UNIPROT not in st.session_state:
st.session_state[StateKeys.INTEGRATE_UNIPROT] = False


class StateKeys(metaclass=ConstantsClass):
USER_SESSION_ID = "user_session_id"
Expand All @@ -164,6 +167,7 @@ class StateKeys(metaclass=ConstantsClass):
SELECTED_GENES_DOWN = "selected_genes_down"
SELECTED_UNIPROT_FIELDS = "selected_uniprot_fields"
MAX_TOKENS = "max_tokens"
INTEGRATE_UNIPROT = "integrate_uniprot"

ORGANISM = "organism" # TODO this is essentially a constant

Expand Down
16 changes: 14 additions & 2 deletions alphastats/llm/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ def get_system_message(dataset: DataSet) -> str:

return (
f"You are an expert biologist and have extensive experience in molecular biology, medicine and biochemistry.{os.linesep}"
"A user will present you with data regarding proteins upregulated in certain cells "
"sourced from UniProt and abstracts from scientific publications. They seek your "
"A user will present you with data regarding proteins upregulated. They seek your "
"expertise in understanding the connections between these proteins and their potential role "
f"in disease genesis. {os.linesep}"
f"Provide a detailed and insightful, yet concise response based on the given information. Use formatting to make your response more human readable."
Expand All @@ -30,16 +29,29 @@ def get_initial_prompt(
parameter_dict: Dict[str, Any],
upregulated_genes: List[str],
downregulated_genes: List[str],
uniprot_info: str,
):
"""Get the initial prompt for the LLM model."""
group1 = parameter_dict["group1"]
group2 = parameter_dict["group2"]
column = parameter_dict["column"]
if uniprot_info:
uniprot_instructions = (
f"We have already retireved relevant information from Uniprot for these proteins:{os.linesep}{os.linesep}{uniprot_info}{os.linesep}{os.linesep}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (and other prompts) use punctuation quite sparsely. My knowledge may be outdated, but I learned that the more structured a prompt, the better. Shall we add some backticks or quotation marks?

just an example:

===
Uniprot information for protein "VCL"
- protein name: `Vinculin (or Metavinculin)`
- entryType of this protein is `UniProtKB reviewed (Swiss-Prot)`.
- primaryAccession of this protein is `P18206`.
- secondaryAccessions of this protein is `Q16450, Q5SWX2, Q7Z3B8, Q8IXU7`.
- XXXXX is `Actin filament (F-actin)-binding protein involved in cell-matrix adhesion and cell-cell adhesion. Regulates cell-surface E-cadherin expression and potentiates mechanosensing by the E-cadherin complex. May also play important roles in cell morphology and locomotion`
===

===
Uniprot information for protein "XYZ"

instead of

The protein VCL is called Vinculin (or Metavinculin).
Uniprot information:
- entryType of this protein is UniProtKB reviewed (Swiss-Prot).
- primaryAccession of this protein is P18206.
- secondaryAccessions of this protein is Q16450, Q5SWX2, Q7Z3B8, Q8IXU7.
- Actin filament (F-actin)-binding protein involved in cell-matrix adhesion and cell-cell adhesion. Regulates cell-surface E-cadherin expression and potentiates mechanosensing by the E-cadherin complex. May also play important roles in cell morphology and locomotion

(maybe a different PR)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will consider this when I start the next PR on prompt engineering.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"proteins" or "genes"? :)

"This contains curated information you may not have encountered before, value it highly. "
"Only retrieve additional information from Uniprot if explicitly asked to do."
)
else:
uniprot_instructions = (
"You have the ability to retrieve curated information from Uniprot about these proteins. "
"Please do so for individual proteins if you have little information about a protein or find a protein particularly important in the specific context."
)
return (
f"We've recently identified several proteins that appear to be differently regulated in cells "
f"when comparing {group1} and {group2} in the {column} group. "
f"From our proteomics experiments, we know that the following ones are upregulated: {', '.join(upregulated_genes)}.{os.linesep}{os.linesep}"
f"Here is the list of proteins that are downregulated: {', '.join(downregulated_genes)}.{os.linesep}{os.linesep}"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also prompt engineering:

Here is a comma-separated list of proteins that are downregulated: `{', '.join(downregulated_genes)}`.{os.linesep}{os.linesep}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will consider this when I start the next PR on prompt engineering.

f"{uniprot_instructions}{os.linesep}{os.linesep}"
f"Help us understand the potential connections between these proteins and how they might be contributing "
f"to the differences. After that provide a high level summary"
)
Expand Down
Loading