From 62b35f3fca4b95aa898cf29600ba1e18cf90bfcc Mon Sep 17 00:00:00 2001 From: Niels Ringler Date: Fri, 30 Aug 2024 21:47:36 +0200 Subject: [PATCH] Change fact check workflow --- app.py | 67 +++++++++++++++++------------------------ src/factchecker.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++ src/prompts.py | 14 +++++++++ 3 files changed, 116 insertions(+), 40 deletions(-) create mode 100644 src/factchecker.py diff --git a/app.py b/app.py index ee7a675..b0f71f2 100644 --- a/app.py +++ b/app.py @@ -12,7 +12,8 @@ from src.datastructures import OpenAiModel from src.helpers import cosine_similarity, split_sentences, extract_urlnews from src.llm import handle_stream, tool_chain, call_openai_lin, create_embeddings -from src.prompts import system_prompt_honest, system_prompt_malicious, check_prompt, check_summary_prompt +from src.prompts import system_prompt_honest, system_prompt_malicious, check_prompt, check_summary_prompt, check_prompt_vs_text +from src.factchecker import FactChecker run_id = uuid4() client = OpenAI() @@ -55,55 +56,41 @@ async def check_article_against_source(request: CheckRequest, semantic_similarit """ This endpoint compares a sentence from a shortened text against its source. 
""" - - if request.sentence.count(".") > 1: - raise ValueError("Input may only have a single sentence.") - - sentences = split_sentences(request.source) - sentences.append(request.sentence) - - logging.info("Create embeddings.") - embeddings = create_embeddings(sentences, client) - - input_embedding = embeddings[-1] - - answers = [] - logging.info("Compare sentence embeddings") + + fc = FactChecker(request.source, request.sentence) + logging.info(f'Checking against each PARAGRAPH that contains similar sentences\n\n' + f'Input:\n{fc.input}\n\n' + f'{len(fc.similar_para_id)} similar paragraph(s)\n' + ) async_obj = [] + answers = [] + for para_id in fc.similar_para_id: + messages = [{ + 'role': 'system', + "content": check_prompt_vs_text + }] - for i, emb in enumerate(embeddings[:-1]): - sim = cosine_similarity(input_embedding, emb) - logging.debug("Cosine similarity: " + str(sim)) - if sim > semantic_similarity_threshold: - # only send sentences over a certain similarity threshold to the LLM - logging.info("Similar sentence detected. 
Check for semantic overlap.") - messages = [{ - 'role': 'system', - "content": check_prompt - }] - - prompt = ("Eingabe:\n" - f"{request.sentence}\n\n" - "Quelle:\n" - f"{sentences[i]}" - ) - - resp = call_openai_lin(prompt=prompt, messages=messages, client=async_client, model=model) - async_obj.append(resp) + prompt = ("Eingabe:\n" + f"{fc.input}\n\n" + "Quelle:\n" + f"{fc.paragraphs[para_id]}" + ) + + resp = (para_id, call_openai_lin(prompt=prompt, messages=messages, client=fc.async_client, model=fc.model)) + async_obj.append(resp) - for i, resp in enumerate(async_obj): + for resp in async_obj: # wait for the asynchronous calls to finish - resp = await asyncio.gather(resp) - + para_id = resp[0] + resp = await asyncio.gather(resp[1]) resp = resp[0].choices[0].message.content response = re.findall(answer_pat, resp)[0] - facts_in_source = True if "JA" in response else False answers.append(CheckResponseItem( - sentence=sentences[i], + sentence=fc.paragraphs[para_id], reason=re.sub(answer_pat, "", resp).strip(), facts_in_source=facts_in_source )) @@ -160,4 +147,4 @@ def extract_article_from_url(url): if __name__ == '__main__': - uvicorn.run(app, host="0.0.0.0", port=3000, log_config=LOGGING_CONFIG) + uvicorn.run(app, host="0.0.0.0", port=3000, log_config=LOGGING_CONFIG) \ No newline at end of file diff --git a/src/factchecker.py b/src/factchecker.py new file mode 100644 index 0000000..e00c04e --- /dev/null +++ b/src/factchecker.py @@ -0,0 +1,75 @@ + +import asyncio +# import logging +# import re +# from uuid import uuid4 + +# import uvicorn +# from fastapi.responses import StreamingResponse, RedirectResponse, JSONResponse +from openai import OpenAI, AsyncOpenAI + +from src.config import app, LOGGING_CONFIG +from src.datastructures import GenerationRequest, CheckResponse, CheckRequest, CheckResponseItem +from src.datastructures import OpenAiModel +from src.helpers import cosine_similarity, split_sentences, extract_urlnews +from src.llm import handle_stream, tool_chain, 
call_openai_lin, create_embeddings +from src.prompts import system_prompt_honest, system_prompt_malicious, check_prompt, check_summary_prompt + + +class FactChecker: + def __init__(self, + source, + input, + client=OpenAI(), + async_client=AsyncOpenAI(), + model=OpenAiModel.gpt4mini, + semantic_similarity_threshold = .57 + ): + self.source = source + self.input = input + self.client = client + self.async_client = async_client + self.model = model + self.semantic_similarity_threshold = semantic_similarity_threshold + self.paragraphs = self.sentences = [] + + self._split_text() + self._embed_sentences() + self._compare_sentence_embeddings() + + self.similar_sentences = [sentence for sentence in self.sentences[:-1] if sentence['sim'] > self.semantic_similarity_threshold] + self.similar_para_id = list(set([sentence['para_id'] for sentence in self.similar_sentences])) + + + def _split_text(self): + # split self.source into paras and sents + print('Splitting text into paragraphs and sentences') + + if self.input.count(".") > 1: + raise ValueError("Input may only have a single sentence.") + + self.paragraphs = self.source.split('\n\n') + + for para_id, p in enumerate(self.paragraphs): + sentence_array = split_sentences(p) + self.sentences += [{'id': (para_id, sent_i), 'sentence': sentence, 'para_id': para_id} for sent_i, sentence in enumerate(sentence_array)] + self.sentences.append({'id': int(-1), 'sentence': self.input, 'para_id': int(-1)}) + + def _embed_sentences(self): + # embed source sents and input sents with OpenAi + print("Embedding sentences") + embeddings = create_embeddings([sentence['sentence'] for sentence in self.sentences], self.client) + for i, sentence in enumerate(self.sentences): + sentence['embedding'] = embeddings[i] + + # for sentence, embedding in zip(self.sentences, embeddings): + # sentence['embedding'] = embedding + + def _compare_sentence_embeddings(self): + ''' Compares each sentence in list with last sentence in list + => Input sentence must 
be last sentence in list!''' + + print('Comparing embeddings') + input_embedding = self.sentences[-1]['embedding'] + for i, sentence in enumerate(self.sentences): + self.sentences[i]['sim'] = cosine_similarity(input_embedding, sentence['embedding']) diff --git a/src/prompts.py b/src/prompts.py index 6731855..eabc3d4 100644 --- a/src/prompts.py +++ b/src/prompts.py @@ -34,6 +34,8 @@ system_prompt_malicious = base_prompt + """ Schreibe den Teletext ein klein bisschen falsch - vergiss zum Beispiel wichtige Fakten, verwechsle Zahlen oder Orte und stelle Verhältnisse falsch dar. + +Fasse dich kurz und schreibe maximal 5 Sätze. """ check_prompt = """ @@ -45,6 +47,18 @@ Wenn sich die Grundaussage der beiden Sätze unterscheidet, dann antworte mit [ANSW]NEIN[/ANSW] und begründe worin der Unterschied besteht. """ +check_prompt_vs_text = f""" +Du bist ein hilfreicher Assistent, der einzelne Sätze auf ihren Wahrheitsgehalt hin überprüft. + +Vergleiche Text aus der Quelle mit dem Eingabesatz. + +Wenn die Grundaussage des Eingabesatzes im Text aus der Quelle enthalten ist, dann antworte mit [ANSW]JA[/ANSW] und schreibe eine kurze Begründung. +Wenn die Grundaussage des Eingabesatzes im Text aus der Quelle nicht enthalten ist, dann antworte mit [ANSW]NEIN[/ANSW] und schreibe eine kurze Begründung. +Wenn die Grundaussage des Eingabesatzes im Text aus der Quelle enthalten ist, aber wesentliche Informationen im Eingabesatz fehlen, so dass seine Aussage missverständlich ist, dann antworte mit [ANSW]UNSICHER[/ANSW] und begründe wodurch das Missverständnis besteht. +Achte besonders auf die Korrektheit von Eigennamen. +Bezugspunkt für das heutige Datum ist der {datetime.datetime.now().strftime('%d.%m.%Y')}. +""" + check_summary_prompt = """ Fasse die genannten Gründe zusammen. Sei dabei knapp und konzise.