ModelingLLM: Add Structured Grading Instruction Generation and Restructure Module (#340)
LeonWehrhahn authored Sep 23, 2024
1 parent bbbcbc9 commit d9ff3bd
Showing 33 changed files with 542 additions and 411 deletions.
5 changes: 3 additions & 2 deletions athena/athena/__init__.py
@@ -4,7 +4,7 @@

from . import contextvars
from .app import app
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore
@@ -36,5 +36,6 @@ def run_module():
"get_experiment_environment",
"ExerciseType",
"GradingCriterion",
"StructuredGradingInstruction"
"StructuredGradingInstruction",
"StructuredGradingCriterion"
]
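
With the new re-export, downstream modules can import the wrapper type directly from the package root alongside the existing schemas:

from athena import GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion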
2 changes: 1 addition & 1 deletion athena/athena/schemas/__init__.py
@@ -13,4 +13,4 @@
from .modeling_feedback import ModelingFeedback
from .modeling_exercise import ModelingExercise
from .modeling_submission import ModelingSubmission
from .grading_criterion import GradingCriterion, StructuredGradingInstruction
from .grading_criterion import GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion
5 changes: 4 additions & 1 deletion athena/athena/schemas/grading_criterion.py
@@ -1,7 +1,7 @@
from abc import ABC
from typing import List, Optional

from pydantic import Field
from pydantic import BaseModel, Field

from .schema import Schema

@@ -24,3 +24,6 @@ class GradingCriterion(Schema, ABC):
structured_grading_instructions: List[StructuredGradingInstruction] = Field(
[], example=[{"credits": 1.0, "gradingScale": "Good", "instructionDescription": "Some instructions", "feedback": "Nicely done!", "usageCount": 1},
{"credits": 0.0, "gradingScale": "Bad", "instructionDescription": "Some instructions", "feedback": "Try again!", "usageCount": 0}])

class StructuredGradingCriterion(BaseModel):
criteria: List[GradingCriterion]
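
StructuredGradingCriterion is a plain pydantic container around the existing criteria. A minimal sketch of how a consumer might wrap and serialize an exercise's criteria, assuming `exercise.grading_criteria` holds the exercise's List[GradingCriterion] (it may be empty or None):

# Sketch only: wrap existing criteria and serialize them, e.g. for embedding into a prompt.
from athena.schemas.grading_criterion import StructuredGradingCriterion

structured_criteria = StructuredGradingCriterion(criteria=exercise.grading_criteria or [])
criteria_json = structured_criteria.json()  # standard pydantic serialization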
@@ -5,9 +5,13 @@

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider
from athena.logger import logger
from athena.modeling import Exercise, Submission, Feedback
from athena.modeling import Exercise, Feedback, Submission
from module_modeling_llm.config import Configuration
from module_modeling_llm.generate_suggestions import generate_suggestions
from module_modeling_llm.core.filter_feedback import filter_feedback
from module_modeling_llm.core.generate_suggestions import generate_suggestions
from module_modeling_llm.core.get_structured_grading_instructions import get_structured_grading_instructions
from module_modeling_llm.utils.convert_to_athana_feedback_model import convert_to_athana_feedback_model
from module_modeling_llm.utils.get_exercise_model import get_exercise_model


@submissions_consumer
@@ -31,7 +35,26 @@ def process_incoming_feedback(exercise: Exercise, submission: Submission, feedba
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: Suggestions for submission %d of exercise %d were requested", submission.id,
exercise.id)
return await generate_suggestions(exercise, submission, is_graded, module_config.approach, module_config.debug)

# First, we convert the incoming exercise and submission to our internal models and textual representations
exercise_model = get_exercise_model(exercise, submission)

# Next, we retrieve or generate the structured grading instructions for the exercise
structured_grading_instructions = await get_structured_grading_instructions(
exercise_model, module_config.approach, exercise.grading_instructions, exercise.grading_criteria, module_config.debug
)

# Finally, we generate feedback suggestions for the submission
feedback = await generate_suggestions(
exercise_model, structured_grading_instructions, module_config.approach, module_config.debug
)

# If the submission is not graded (i.e. the student requested feedback), we reformulate the feedback so that it does not give away the solution
if is_graded is False:
feedback = await filter_feedback(exercise_model, feedback, module_config.approach, module_config.debug)

return convert_to_athana_feedback_model(feedback, exercise_model)



if __name__ == "__main__":
@@ -1,11 +1,12 @@
from typing import Optional
from module_modeling_llm.helpers.serializers.parser.uml_parser import UMLParser
import json

from module_modeling_llm.apollon_transformer.parser.uml_parser import UMLParser

class DiagramModelSerializer:

class ApollonJSONTransformer:

@staticmethod
def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
def transform_json(model: str) -> tuple[str, dict[str, str], str]:
"""
Serialize a given Apollon diagram model to a string representation.
This method converts the UML diagram model into a format similar to mermaid syntax, called "apollon".
@@ -14,7 +15,12 @@ def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
:return: A tuple containing the serialized model as a string, a dictionary mapping element and relation names
to their corresponding IDs, and the diagram type.
"""
parser = UMLParser(model)

model_dict = json.loads(model)

parser = UMLParser(model_dict)

diagram_type = model_dict.get("type", "unknown")

# Convert the UML diagram to the apollon representation
apollon_representation = parser.to_apollon()
@@ -25,5 +31,5 @@ def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
**{relation['name']: relation['id'] for relation in parser.get_relations()}
}

return apollon_representation, names
return apollon_representation, names, diagram_type
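
For illustration, a minimal sketch of how the new entry point might be called, assuming `model_json` is the raw Apollon diagram JSON stored with a submission:

# model_json: Apollon diagram JSON as a string (assumed to come from the submission payload)
apollon_text, name_to_id, diagram_type = ApollonJSONTransformer.transform_json(model_json)

# apollon_text  - textual "apollon" representation of the diagram
# name_to_id    - maps element and relation names to their Apollon IDs
# diagram_type  - taken from the JSON's "type" field, "unknown" if missing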

@@ -27,11 +27,6 @@ def resolve_references(self, element_dict: Dict[str, Any]):
self.attributes = [element_dict[ref].get("name", "") for ref in self.attribute_refs if ref in element_dict]
self.methods = [element_dict[ref].get('name', '') for ref in self.method_refs if ref in element_dict]

for ref_list, target_list in [(self.attribute_refs, self.attributes), (self.method_refs, self.methods)]:
target_list.extend(
element_dict.get(ref, {}).get("name", "") for ref in ref_list if ref in element_dict
)

def to_apollon(self) -> str:
parts = [f"[{self.type}] {self.name}"]

@@ -1,8 +1,8 @@
from typing import Dict, Any, List
from string import ascii_uppercase

from module_modeling_llm.helpers.serializers.parser.element import Element
from module_modeling_llm.helpers.serializers.parser.relation import Relation
from module_modeling_llm.apollon_transformer.parser.element import Element
from module_modeling_llm.apollon_transformer.parser.relation import Relation


class UMLParser:
@@ -42,9 +42,14 @@ def _parse(self) -> None:
for element_data in self.data['elements'].values():
if element_data.get('id') not in referenced_ids:
name = element_data.get('name')
if name_count[name] > 1:
suffix_index = name_suffix_counters[name]
element_data['name'] = f"{name}{ascii_uppercase[suffix_index]}"
suffix_index = name_suffix_counters[name]

if name == '':
element_data['name'] = f"##{ascii_uppercase[suffix_index]}"
if name_count[name] > 1:
name_suffix_counters[name] += 1
elif name_count[name] > 1:
element_data['name'] = f"{name}#{ascii_uppercase[suffix_index]}"
name_suffix_counters[name] += 1

element = Element(element_data, self.data['elements'])
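
For example, three elements that are not referenced by any other element and are named "Class", "Class" and "" (an empty name) are rewritten to "Class#A", "Class#B" and "##A" respectively, while unique, non-empty names are left untouched; this keeps otherwise ambiguous elements distinguishable in the serialized diagram.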
49 changes: 30 additions & 19 deletions modules/modeling/module_modeling_llm/module_modeling_llm/config.py
@@ -1,15 +1,13 @@
from pydantic import BaseModel, Field

from athena import config_schema_provider
from module_modeling_llm.helpers.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts.generate_suggestions import (
graded_feedback_system_message as default_graded_feedback_system_message,
graded_feedback_human_message as default_graded_feedback_human_message,
filter_feedback_system_message as default_filter_feedback_system_message,
filter_feedback_human_message as default_filter_feedback_human_message
from module_modeling_llm.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts import (
graded_feedback_prompt,
filter_feedback_prompt,
structured_grading_instructions_prompt
)


class GenerateSuggestionsPrompt(BaseModel):
"""
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**,
@@ -18,25 +16,38 @@ class GenerateSuggestionsPrompt(BaseModel):
_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input
is too long._
"""
graded_feedback_system_message: str = Field(default=default_graded_feedback_system_message,
description="Message for priming AI behavior and instructing it what to do.")
graded_feedback_human_message: str = Field(default=default_graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
filter_feedback_system_message: str = Field(default=default_filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback.")
filter_feedback_human_message: str = Field(default=default_filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback.")


graded_feedback_system_message: str = Field(
default=graded_feedback_prompt.graded_feedback_system_message,
description="Message for priming AI behavior and instructing it what to do."
)
graded_feedback_human_message: str = Field(
default=graded_feedback_prompt.graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act."
)
filter_feedback_system_message: str = Field(
default=filter_feedback_prompt.filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback."
)
filter_feedback_human_message: str = Field(
default=filter_feedback_prompt.filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback."
)
structured_grading_instructions_system_message: str = Field(
default=structured_grading_instructions_prompt.structured_grading_instructions_system_message,
description="Message for instructing AI to structure the Problem Statement"
)
structured_grading_instructions_human_message: str = Field(
default=structured_grading_instructions_prompt.structured_grading_instructions_human_message,
description="Message for instructing AI to filter ungraded feedback."
)

class BasicApproachConfig(BaseModel):
"""This approach uses a LLM with a single prompt to generate feedback in a single step."""
max_input_tokens: int = Field(default=3000, description="Maximum number of tokens in the input prompt.")
model: ModelConfigType = Field(default=DefaultModelConfig()) # type: ignore
generate_suggestions_prompt: GenerateSuggestionsPrompt = Field(default=GenerateSuggestionsPrompt())


@config_schema_provider
class Configuration(BaseModel):
debug: bool = Field(default=False, description="Enable debug mode.")
approach: BasicApproachConfig = Field(default=BasicApproachConfig())
approach: BasicApproachConfig = Field(default=BasicApproachConfig())
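
Because every prompt is an ordinary pydantic field with a default, a single message can be overridden without redefining the rest. A minimal sketch, assuming the default model configuration resolves in the current environment (the message text is illustrative, not a shipped default):

# Sketch only: override one prompt, keep all other defaults.
custom_config = Configuration(
    debug=True,
    approach=BasicApproachConfig(
        generate_suggestions_prompt=GenerateSuggestionsPrompt(
            filter_feedback_system_message=(
                "Rewrite the graded feedback so it guides the student "
                "towards a correct model without revealing the sample solution."
            )
        )
    )
)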
@@ -0,0 +1,52 @@
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from athena import emit_meta
from module_modeling_llm.config import BasicApproachConfig
from module_modeling_llm.utils.predict_and_parse import predict_and_parse
from module_modeling_llm.models.assessment_model import AssessmentModel
from module_modeling_llm.models.exercise_model import ExerciseModel
from module_modeling_llm.prompts.filter_feedback_prompt import FilterFeedbackInputs

async def filter_feedback(
exercise: ExerciseModel,
original_feedback: AssessmentModel,
config: BasicApproachConfig,
debug: bool,
) -> AssessmentModel:

print(f"\n\n\n\n\n{original_feedback.json()}\n\n\n\n\n")

chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.filter_feedback_system_message),
("human", config.generate_suggestions_prompt.filter_feedback_human_message)
])

prompt_inputs = FilterFeedbackInputs(
original_feedback=original_feedback.json(),
feedback_output_format=PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
)

feedback_result = await predict_and_parse(
model=config.model.get_model(), # type: ignore[attr-defined]
chat_prompt=chat_prompt,
prompt_input=prompt_inputs.dict(),
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise.exercise_id}-filter",
f"submission-{exercise.submission_id}-filter",
]
)

if debug:
emit_meta("filter_feedback", {
"prompt": chat_prompt.format(**prompt_inputs.dict()),
"result": feedback_result.dict() if feedback_result is not None else None
})

if feedback_result is None:
raise ValueError("No feedback was returned by the model.")

print(f"\n\n\n\n\n{feedback_result.json()}\n\n\n\n\n")

return feedback_result
@@ -0,0 +1,64 @@
from athena.schemas.grading_criterion import StructuredGradingCriterion
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from athena import emit_meta
from module_modeling_llm.config import BasicApproachConfig
from module_modeling_llm.models.assessment_model import AssessmentModel
from module_modeling_llm.prompts.apollon_format_description import apollon_format_description
from module_modeling_llm.utils.predict_and_parse import predict_and_parse
from module_modeling_llm.prompts.graded_feedback_prompt import GradedFeedbackInputs
from module_modeling_llm.models.exercise_model import ExerciseModel

async def generate_suggestions(
exercise_model: ExerciseModel,
structured_grading_instructions: StructuredGradingCriterion,
config: BasicApproachConfig,
debug: bool) -> AssessmentModel:
"""
Generate feedback suggestions for modeling exercise submissions
:param exercise: The exercise for which a submission is assessed
:param submission: The submission that is assessed
:param is_graded: Indicates whether the submission is graded
:param config: A configuration object for the feedback module
:param debug: Indicates whether additional debugging information should be provided
:return: A list of feedback items for the assessed submission
"""

prompt_inputs = GradedFeedbackInputs(
submission=exercise_model.transformed_submission,
problem_statement=exercise_model.problem_statement,
max_points=exercise_model.max_points,
bonus_points=exercise_model.bonus_points,
structured_grading_instructions=structured_grading_instructions.json(),
submission_uml_type=exercise_model.submission_uml_type,
example_solution=exercise_model.transformed_example_solution,
uml_diagram_format=apollon_format_description,
feedback_output_format=PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
)

chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.graded_feedback_system_message),
("human", config.generate_suggestions_prompt.graded_feedback_human_message)])

feedback_result = await predict_and_parse(
model=config.model.get_model(), # type: ignore[attr-defined]
chat_prompt=chat_prompt,
prompt_input=prompt_inputs.dict(),
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise_model.exercise_id}",
f"submission-{exercise_model.submission_id}",
]
)

if debug:
emit_meta("generate_suggestions", {
"prompt": chat_prompt.format(**prompt_inputs.dict()),
"result": feedback_result.dict() if feedback_result is not None else None
})

if feedback_result is None:
raise ValueError("No feedback was generated")

return feedback_result