diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 2c2f702..5b00dcc 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -1,4 +1,4 @@ -name: Docker +name: Docker build & push # This workflow uses actions that are not certified by GitHub. # They are provided by a third-party and are governed by diff --git a/src/templates/form.html b/src/templates/form.html index 9dac9e0..620cae8 100644 --- a/src/templates/form.html +++ b/src/templates/form.html @@ -1,14 +1,54 @@ - + - Simple Form + + + YoYo MaskR -

upload text to mask

-
-
-

- +

Input:

+ + +
+ +

Response:

+ + + - \ No newline at end of file + diff --git a/src/utils/llm.py b/src/utils/llm.py index 45c7ed0..0e01795 100644 --- a/src/utils/llm.py +++ b/src/utils/llm.py @@ -8,7 +8,7 @@ Task instructions: Analyze the text provided after 'Text to anonymize' carefully for all names of persons and places. For each name that you find evaluate whether it is a new name or just a repetition or variation of a name you have already found before. Names of persons are labeled as #person_1#, #person_2#, etc. Names of places are labeled as #place_1#, #place_2#, etc. -Only return a json dictionary without any comments or markdown formatting around it. +Only return a json dictionary without any comments or markdown formatting around it. Do not return keys without values. Example input: 'Tony Stark and Peter Parker walk through New York where Peter wants to show Tony the Broadway.'. Example output: {{"#person_1#": ["Tony Stark", "Tony"], "#person_2#": ["Peter Parker", "Peter"], "#place_1#": ["New York"], "#place_2#": ["Broadway"]}} Text to anonymize: {text} @@ -31,4 +31,6 @@ def llm_find_entities(text, temperature=0, template=TEMPLATE, raw=False): result = chain.invoke({"text": text}) if raw: return result - return {k: sorted(v, key=len, reverse=True) for k, v in json.loads(result).items()} + + ret = {k: v for k, v in json.loads(result).items()} + return {k: set(v) for k, v in ret.items() if v} diff --git a/tests/test_llm.py b/tests/test_llm.py new file mode 100644 index 0000000..23f173e --- /dev/null +++ b/tests/test_llm.py @@ -0,0 +1,36 @@ +import pytest +from src.utils.llm import llm_find_entities + +def test_llm_find_entities_basic(): + text = "Tony Stark and Peter Parker walk through New York where Peter wants to show Tony the Broadway." + expected_output = {'#person_1#': {'Tony Stark', 'Tony'}, '#person_2#': {'Peter Parker', 'Peter'}, '#place_1#': {'New York'}, '#place_2#': {'Broadway'}} + result = llm_find_entities(text) + assert result == expected_output + assert result == expected_output + +def test_llm_find_entities_no_entities(): + text = "This is a text without any names of persons or places." + expected_output = {} + result = llm_find_entities(text) + assert result == expected_output + +def test_llm_find_entities_repeated_names(): + text = "Alice and Bob went to Wonderland. Alice met Bob at the Wonderland park." + expected_output = { + "#person_1#": ["Alice"], + "#person_2#": ["Bob"], + "#place_1#": ["Wonderland"], + '#place_2#': ['Wonderland park'] + } + result = llm_find_entities(text) + +def test_llm_find_entities_raw_output(): + text = "Tony Stark and Peter Parker walk through New York where Peter wants to show Tony the Broadway." + result = llm_find_entities(text, raw=True) + print("result") + print(result) + assert isinstance(result, str) + assert "Tony Stark" in result + assert "Peter Parker" in result + assert "New York" in result + assert "Broadway" in result \ No newline at end of file