ModelingLLM: Add Structured Grading Instruction Generation and Restructure Module (#340)
LeonWehrhahn authored Sep 23, 2024
1 parent bbbcbc9 commit d9ff3bd
Showing 33 changed files with 542 additions and 411 deletions.
5 changes: 3 additions & 2 deletions athena/athena/__init__.py
@@ -4,7 +4,7 @@

from . import contextvars
from .app import app
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore
@@ -36,5 +36,6 @@ def run_module():
"get_experiment_environment",
"ExerciseType",
"GradingCriterion",
"StructuredGradingInstruction"
"StructuredGradingInstruction",
"StructuredGradingCriterion"
]
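
With the new re-export, downstream modules can import the wrapper type directly from the package root alongside the existing schemas:

from athena import GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion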
2 changes: 1 addition & 1 deletion athena/athena/schemas/__init__.py
@@ -13,4 +13,4 @@
from .modeling_feedback import ModelingFeedback
from .modeling_exercise import ModelingExercise
from .modeling_submission import ModelingSubmission
from .grading_criterion import GradingCriterion, StructuredGradingInstruction
from .grading_criterion import GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion
5 changes: 4 additions & 1 deletion athena/athena/schemas/grading_criterion.py
@@ -1,7 +1,7 @@
from abc import ABC
from typing import List, Optional

from pydantic import Field
from pydantic import BaseModel, Field

from .schema import Schema

@@ -24,3 +24,6 @@ class GradingCriterion(Schema, ABC):
structured_grading_instructions: List[StructuredGradingInstruction] = Field(
[], example=[{"credits": 1.0, "gradingScale": "Good", "instructionDescription": "Some instructions", "feedback": "Nicely done!", "usageCount": 1},
{"credits": 0.0, "gradingScale": "Bad", "instructionDescription": "Some instructions", "feedback": "Try again!", "usageCount": 0}])

class StructuredGradingCriterion(BaseModel):
criteria: List[GradingCriterion]
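
StructuredGradingCriterion is a plain pydantic container around the existing criteria. A minimal sketch of how a consumer might wrap and serialize an exercise's criteria, assuming `exercise.grading_criteria` holds the exercise's List[GradingCriterion] (it may be empty or None):

# Sketch only: wrap existing criteria and serialize them, e.g. for embedding into a prompt.
from athena.schemas.grading_criterion import StructuredGradingCriterion

structured_criteria = StructuredGradingCriterion(criteria=exercise.grading_criteria or [])
criteria_json = structured_criteria.json()  # standard pydantic serialization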
@@ -5,9 +5,13 @@

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider
from athena.logger import logger
from athena.modeling import Exercise, Submission, Feedback
from athena.modeling import Exercise, Feedback, Submission
from module_modeling_llm.config import Configuration
from module_modeling_llm.generate_suggestions import generate_suggestions
from module_modeling_llm.core.filter_feedback import filter_feedback
from module_modeling_llm.core.generate_suggestions import generate_suggestions
from module_modeling_llm.core.get_structured_grading_instructions import get_structured_grading_instructions
from module_modeling_llm.utils.convert_to_athana_feedback_model import convert_to_athana_feedback_model
from module_modeling_llm.utils.get_exercise_model import get_exercise_model


@submissions_consumer
@@ -31,7 +35,26 @@ def process_incoming_feedback(exercise: Exercise, submission: Submission, feedba
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: Suggestions for submission %d of exercise %d were requested", submission.id,
exercise.id)
return await generate_suggestions(exercise, submission, is_graded, module_config.approach, module_config.debug)

# First, we convert the incoming exercise and submission to our internal models and textual representations
exercise_model = get_exercise_model(exercise, submission)

# Next, we retrieve or generate the structured grading instructions for the exercise
structured_grading_instructions = await get_structured_grading_instructions(
exercise_model, module_config.approach, exercise.grading_instructions, exercise.grading_criteria, module_config.debug
)

# Finally, we generate feedback suggestions for the submission
feedback = await generate_suggestions(
exercise_model, structured_grading_instructions, module_config.approach, module_config.debug
)

# If the submission is not graded (i.e. the student requested feedback), we reformulate the feedback so that it does not give away the solution
if is_graded is False:
feedback = await filter_feedback(exercise_model, feedback, module_config.approach, module_config.debug)

return convert_to_athana_feedback_model(feedback, exercise_model)



if __name__ == "__main__":
@@ -1,11 +1,12 @@
from typing import Optional
from module_modeling_llm.helpers.serializers.parser.uml_parser import UMLParser
import json

from module_modeling_llm.apollon_transformer.parser.uml_parser import UMLParser

class DiagramModelSerializer:

class ApollonJSONTransformer:

@staticmethod
def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
def transform_json(model: str) -> tuple[str, dict[str, str], str]:
"""
Serialize a given Apollon diagram model to a string representation.
This method converts the UML diagram model into a format similar to mermaid syntax, called "apollon".
@@ -14,7 +15,12 @@ def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
:return: A tuple containing the serialized model as a string, a dictionary mapping element and relation names
to their corresponding IDs, and the diagram type.
"""
parser = UMLParser(model)

model_dict = json.loads(model)

parser = UMLParser(model_dict)

diagram_type = model_dict.get("type", "unknown")

# Convert the UML diagram to the apollon representation
apollon_representation = parser.to_apollon()
@@ -25,5 +31,5 @@ def serialize_model(model: dict) -> tuple[Optional[str], dict[str, str]]:
**{relation['name']: relation['id'] for relation in parser.get_relations()}
}

return apollon_representation, names
return apollon_representation, names, diagram_type
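
For illustration, a minimal sketch of how the new entry point might be called, assuming `model_json` is the raw Apollon diagram JSON stored with a submission:

# model_json: Apollon diagram JSON as a string (assumed to come from the submission payload)
apollon_text, name_to_id, diagram_type = ApollonJSONTransformer.transform_json(model_json)

# apollon_text  - textual "apollon" representation of the diagram
# name_to_id    - maps element and relation names to their Apollon IDs
# diagram_type  - taken from the JSON's "type" field, "unknown" if missing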

@@ -27,11 +27,6 @@ def resolve_references(self, element_dict: Dict[str, Any]):
self.attributes = [element_dict[ref].get("name", "") for ref in self.attribute_refs if ref in element_dict]
self.methods = [element_dict[ref].get('name', '') for ref in self.method_refs if ref in element_dict]

for ref_list, target_list in [(self.attribute_refs, self.attributes), (self.method_refs, self.methods)]:
target_list.extend(
element_dict.get(ref, {}).get("name", "") for ref in ref_list if ref in element_dict
)

def to_apollon(self) -> str:
parts = [f"[{self.type}] {self.name}"]

@@ -1,8 +1,8 @@
from typing import Dict, Any, List
from string import ascii_uppercase

from module_modeling_llm.helpers.serializers.parser.element import Element
from module_modeling_llm.helpers.serializers.parser.relation import Relation
from module_modeling_llm.apollon_transformer.parser.element import Element
from module_modeling_llm.apollon_transformer.parser.relation import Relation


class UMLParser:
@@ -42,9 +42,14 @@ def _parse(self) -> None:
for element_data in self.data['elements'].values():
if element_data.get('id') not in referenced_ids:
name = element_data.get('name')
if name_count[name] > 1:
suffix_index = name_suffix_counters[name]
element_data['name'] = f"{name}{ascii_uppercase[suffix_index]}"
suffix_index = name_suffix_counters[name]

if name == '':
element_data['name'] = f"##{ascii_uppercase[suffix_index]}"
if name_count[name] > 1:
name_suffix_counters[name] += 1
elif name_count[name] > 1:
element_data['name'] = f"{name}#{ascii_uppercase[suffix_index]}"
name_suffix_counters[name] += 1

element = Element(element_data, self.data['elements'])
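
For example, three elements that are not referenced by any other element and are named "Class", "Class" and "" (an empty name) are rewritten to "Class#A", "Class#B" and "##A" respectively, while unique, non-empty names are left untouched; this keeps otherwise ambiguous elements distinguishable in the serialized diagram.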
49 changes: 30 additions & 19 deletions modules/modeling/module_modeling_llm/module_modeling_llm/config.py
@@ -1,15 +1,13 @@
from pydantic import BaseModel, Field

from athena import config_schema_provider
from module_modeling_llm.helpers.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts.generate_suggestions import (
graded_feedback_system_message as default_graded_feedback_system_message,
graded_feedback_human_message as default_graded_feedback_human_message,
filter_feedback_system_message as default_filter_feedback_system_message,
filter_feedback_human_message as default_filter_feedback_human_message
from module_modeling_llm.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts import (
graded_feedback_prompt,
filter_feedback_prompt,
structured_grading_instructions_prompt
)


class GenerateSuggestionsPrompt(BaseModel):
"""
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**,
@@ -18,25 +16,38 @@ class GenerateSuggestionsPrompt(BaseModel):
_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input
is too long._
"""
graded_feedback_system_message: str = Field(default=default_graded_feedback_system_message,
description="Message for priming AI behavior and instructing it what to do.")
graded_feedback_human_message: str = Field(default=default_graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
filter_feedback_system_message: str = Field(default=default_filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback.")
filter_feedback_human_message: str = Field(default=default_filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback.")


graded_feedback_system_message: str = Field(
default=graded_feedback_prompt.graded_feedback_system_message,
description="Message for priming AI behavior and instructing it what to do."
)
graded_feedback_human_message: str = Field(
default=graded_feedback_prompt.graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act."
)
filter_feedback_system_message: str = Field(
default=filter_feedback_prompt.filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback."
)
filter_feedback_human_message: str = Field(
default=filter_feedback_prompt.filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback."
)
structured_grading_instructions_system_message: str = Field(
default=structured_grading_instructions_prompt.structured_grading_instructions_system_message,
description="Message for instructing AI to structure the Problem Statement"
)
structured_grading_instructions_human_message: str = Field(
default=structured_grading_instructions_prompt.structured_grading_instructions_human_message,
description="Message for instructing AI to filter ungraded feedback."
)

class BasicApproachConfig(BaseModel):
"""This approach uses a LLM with a single prompt to generate feedback in a single step."""
max_input_tokens: int = Field(default=3000, description="Maximum number of tokens in the input prompt.")
model: ModelConfigType = Field(default=DefaultModelConfig()) # type: ignore
generate_suggestions_prompt: GenerateSuggestionsPrompt = Field(default=GenerateSuggestionsPrompt())


@config_schema_provider
class Configuration(BaseModel):
debug: bool = Field(default=False, description="Enable debug mode.")
approach: BasicApproachConfig = Field(default=BasicApproachConfig())
approach: BasicApproachConfig = Field(default=BasicApproachConfig())
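
Because every prompt is an ordinary pydantic field with a default, a single message can be overridden without redefining the rest. A minimal sketch, assuming the default model configuration resolves in the current environment (the message text is illustrative, not a shipped default):

# Sketch only: override one prompt, keep all other defaults.
custom_config = Configuration(
    debug=True,
    approach=BasicApproachConfig(
        generate_suggestions_prompt=GenerateSuggestionsPrompt(
            filter_feedback_system_message=(
                "Rewrite the graded feedback so it guides the student "
                "towards a correct model without revealing the sample solution."
            )
        )
    )
)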
@@ -0,0 +1,52 @@
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from athena import emit_meta
from module_modeling_llm.config import BasicApproachConfig
from module_modeling_llm.utils.predict_and_parse import predict_and_parse
from module_modeling_llm.models.assessment_model import AssessmentModel
from module_modeling_llm.models.exercise_model import ExerciseModel
from module_modeling_llm.prompts.filter_feedback_prompt import FilterFeedbackInputs

async def filter_feedback(
exercise: ExerciseModel,
original_feedback: AssessmentModel,
config: BasicApproachConfig,
debug: bool,
) -> AssessmentModel:

print(f"\n\n\n\n\n{original_feedback.json()}\n\n\n\n\n")

chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.filter_feedback_system_message),
("human", config.generate_suggestions_prompt.filter_feedback_human_message)
])

prompt_inputs = FilterFeedbackInputs(
original_feedback=original_feedback.json(),
feedback_output_format=PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
)

feedback_result = await predict_and_parse(
model=config.model.get_model(), # type: ignore[attr-defined]
chat_prompt=chat_prompt,
prompt_input=prompt_inputs.dict(),
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise.exercise_id}-filter",
f"submission-{exercise.submission_id}-filter",
]
)

if debug:
emit_meta("filter_feedback", {
"prompt": chat_prompt.format(**prompt_inputs.dict()),
"result": feedback_result.dict() if feedback_result is not None else None
})

if feedback_result is None:
raise ValueError("No feedback was returned by the model.")

print(f"\n\n\n\n\n{feedback_result.json()}\n\n\n\n\n")

return feedback_result
@@ -0,0 +1,64 @@
from athena.schemas.grading_criterion import StructuredGradingCriterion
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from athena import emit_meta
from module_modeling_llm.config import BasicApproachConfig
from module_modeling_llm.models.assessment_model import AssessmentModel
from module_modeling_llm.prompts.apollon_format_description import apollon_format_description
from module_modeling_llm.utils.predict_and_parse import predict_and_parse
from module_modeling_llm.prompts.graded_feedback_prompt import GradedFeedbackInputs
from module_modeling_llm.models.exercise_model import ExerciseModel

async def generate_suggestions(
exercise_model: ExerciseModel,
structured_grading_instructions: StructuredGradingCriterion,
config: BasicApproachConfig,
debug: bool) -> AssessmentModel:
"""
Generate feedback suggestions for modeling exercise submissions
:param exercise: The exercise for which a submission is assessed
:param submission: The submission that is assessed
:param is_graded: Indicates whether the submission is graded
:param config: A configuration object for the feedback module
:param debug: Indicates whether additional debugging information should be provided
:return: A list of feedback items for the assessed submission
"""

prompt_inputs = GradedFeedbackInputs(
submission=exercise_model.transformed_submission,
problem_statement=exercise_model.problem_statement,
max_points=exercise_model.max_points,
bonus_points=exercise_model.bonus_points,
structured_grading_instructions=structured_grading_instructions.json(),
submission_uml_type=exercise_model.submission_uml_type,
example_solution=exercise_model.transformed_example_solution,
uml_diagram_format=apollon_format_description,
feedback_output_format=PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
)

chat_prompt = ChatPromptTemplate.from_messages([
("system", config.generate_suggestions_prompt.graded_feedback_system_message),
("human", config.generate_suggestions_prompt.graded_feedback_human_message)])

feedback_result = await predict_and_parse(
model=config.model.get_model(), # type: ignore[attr-defined]
chat_prompt=chat_prompt,
prompt_input=prompt_inputs.dict(),
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise_model.exercise_id}",
f"submission-{exercise_model.submission_id}",
]
)

if debug:
emit_meta("generate_suggestions", {
"prompt": chat_prompt.format(**prompt_inputs.dict()),
"result": feedback_result.dict() if feedback_result is not None else None
})

if feedback_result is None:
raise ValueError("No feedback was generated")

return feedback_result