[Bug] Token Classification emojis cause overlapping spans error & wrong annotations #2353
I've found a related issue:

```python
import argilla as rg

records = [
    rg.TokenClassificationRecord(
        text="I ❤️ you", tokens=["I", "❤️", "you"], prediction=[("I", 0, 1), ("emoji", 2, 4), ("you", 5, 8)]
    ),
    rg.TokenClassificationRecord(
        text="I 💚 you", tokens=["I", "💚", "you"], prediction=[("I", 0, 1), ("emoji", 2, 3), ("you", 4, 7)]
    ),
    rg.TokenClassificationRecord(
        text="I h you", tokens=["I", "h", "you"], prediction=[("I", 0, 1), ("emoji", 2, 3), ("you", 4, 7)]
    ),
]
rg.delete("issue_2353_emoji")
rg.log(records, "issue_2353_emoji")
```

Note also the following awkward Python behaviour:

```python
>>> len("h")
1
>>> len("💚")
1
>>> len("❤️")
2
```
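As an aside (not from the thread), the surprising `len` result above comes from the red heart being composed of two code points, while the green heart is a single one. A minimal sketch using the standard-library `unicodedata` module:

```python
# Why len("❤️") == 2 but len("💚") == 1: the red heart is U+2764
# followed by the variation selector U+FE0F, while the green heart
# is a single code point, U+1F49A.
import unicodedata

red_heart = "\u2764\ufe0f"    # ❤️  (two code points)
green_heart = "\U0001F49A"    # 💚  (one code point)

for ch in red_heart:
    print(hex(ord(ch)), unicodedata.name(ch))
# 0x2764 HEAVY BLACK HEART
# 0xfe0f VARIATION SELECTOR-16

assert len(red_heart) == 2
assert len(green_heart) == 1
```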
Emoji, halfwidth chars, and double-width chars are never easy to work with 🥲
I've expanded on @cceyda's useful example with the red heart to show that the issue only exists with the colored one:

```python
import argilla as rg

rg.delete("issue_2353_emoji")

tokens = ["💚", "a", "b", "c", "d", "💚", "e", "f", "g", "j", "k", "l"]
text = "💚abcd💚efgjkl"
entities = [("A", 6, 9)]
assert text[6:9] == "efg"
record = rg.TokenClassificationRecord(
    text=text,
    tokens=tokens,
    prediction=entities,
)
rg.log(record, "issue_2353_emoji")

tokens = ["❤️", "a", "b", "c", "d", "❤️", "e", "f", "g", "j", "k", "l"]
text = "❤️abcd❤️efgjkl"
entities = [("A", 8, 11)]
assert text[8:11] == "efg"
record = rg.TokenClassificationRecord(
    text=text,
    tokens=tokens,
    prediction=entities,
)
rg.log(record, "issue_2353_emoji")
```
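As a side note (not from the thread): the offset shift in these examples can be predicted by comparing Python's code-point length with JavaScript's UTF-16 `.length`. A minimal sketch, where `utf16_len` is a hypothetical helper, not an Argilla API:

```python
def utf16_len(s: str) -> int:
    """Number of UTF-16 code units, i.e. what JavaScript's `"str".length` reports."""
    return len(s.encode("utf-16-le")) // 2

text = "💚abcd💚efgjkl"
# Python slices by code points, so "efg" starts at offset 6 here...
assert len(text) == 12
# ...but 💚 (U+1F49A) is a surrogate pair in UTF-16, so JS sees a longer
# string and would place "efg" at offset 8 instead.
assert utf16_len(text) == 14
```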
I saw there was a related PR a year ago: 8b570fb
Maybe @leiyre can give more context.
https://hsivonen.fi/string-length/ Problematic examples: "🤦🏼♂️", "🤦🏼", "💖", "💘", "💝", "💞", "❣️", "✨".
Hi, in 8b570fb we included stringz in the frontend to avoid the problem with Unicode in JavaScript. @tomaarsen, could I see your example in dev or pre, to explore the behavior a bit more?
@leiyre I've pushed my latest example to dev under
Here is another thing: Screen.Recording.2023-02-17.at.0.38.25.mov

I mean, I understand why it happens, but not how to fix it 🤣. I don't really care about the visual artifact on the UI as long as the annotation boundaries are passed correctly when used in Python! Like:

```python
label_map = {
    "A": "location", "B": "organization", "C": "building", "D": "vegetable", "E": "fruit", "F": "street", "G": "apt", "J": "no",
    "K": "food", "L": "drink", "M": "alcohol", "N": "name", "O": "phone", "P": "etc", "R": "blah", "S": "something", "T": "somethingelse",
}
labels = list(label_map.keys())
texts = [
    'ABCDEFG 🤦🏼♂️ jklmnop',
    'ABCDEFG 💞 jklmnop',
    'ABCDEFG 💚jklmnop',
    'ABCDEFG (❣️) jklmnop',
]
records = []
for i, text in enumerate(texts):
    record = rg.TokenClassificationRecord(
        text=text,
        tokens=list(text.replace(" ", "")),
        prediction=[(labels[i], i, i + 1)],
        prediction_agent="model",
        # annotation=entities,
        # annotation_agent="old",
        metadata={},
        status='Default',
        id=i,
    )
    records.append(record)
rg.log(records=records, name="test-emoji")
```

Also, a question: would this
Hola! When looking at the problem, we saw with @leiyre that some updates need to be done in the frontend.
Yes, JavaScript uses the UTF-16 encoding to calculate string lengths, while Python counts code points:

```python
import grapheme

# Example problematic strings gathered from around the internet; they just
# need to be composed of more than one Unicode code point to cause problems.
string = "👩❤️💋👩🐦한ABதமிழ்💚🤦🏼♂️"
chars = list(grapheme.graphemes(string))
for char in chars:
    print(f"original:{char}")
    print(f"codepoints:{list(char)}")
    for l in list(char):
        print(l, ord(l))
        for i, b in enumerate(bytes(l, encoding='utf-8')):
            print(i, b)
        print("-----")
    print("#############")
```
Also, I learned that in JS, if you use array expansion (?), you can get the number of code points accurately (same as Python).
I think you mean the spread operator: `[..."👩💻"].length` = 3
But that is how Python counts too! It counts code points. How humans perceive a single letter (A, B, C, etc.; think of this as the grapheme), how a single grapheme is represented (by one or more Unicode code points), and how those points are encoded (UTF-16, UTF-8) are all different things. This was also helpful: https://stackoverflow.com/a/51422499/3726119

On the UI side we want graphemes (using an Intl.Segmenter() polyfill can work for this); on the backend we want to count code points, like Python. JS natively (just doing "str".length) calculates the UTF-16 encoded length.
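To make the grapheme / code point / encoding distinction concrete, here is a small sketch (an illustration, not from the thread; the ZWJ emoji sequence is built explicitly from escapes, and counting it as one grapheme would need a library such as the `grapheme` package used earlier):

```python
# "Man facepalming, medium-light skin tone" as an explicit ZWJ sequence:
# U+1F926 (🤦) + U+1F3FC (skin tone) + U+200D (ZWJ) + U+2642 (♂) + U+FE0F.
s = "\U0001F926\U0001F3FC\u200d\u2642\ufe0f"

# One grapheme to human eyes, but:
assert len(s) == 5                            # Python: 5 code points
assert len(s.encode("utf-16-le")) // 2 == 7   # JS "str".length: 7 UTF-16 units
assert len(s.encode("utf-8")) == 17           # 17 UTF-8 bytes
```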
@cceyda If that is indeed consistently equivalent to the Python lengths, then perhaps we can use that to place the spans correctly by only changing the frontend? Or would we strictly need to use e.g. the
I would heavily suggest not straying from the norm of using
We should use graphemes only on the UI side, because that is what makes sense to human eyes 👀 😄
This issue is stale because it has been open for 90 days with no activity.

bump as still important

This issue is stale because it has been open for 90 days with no activity.

bump

This issue is stale because it has been open for 90 days with no activity.
This issue was closed because it has been inactive for 30 days since being marked as stale. |
Describe the bug
If there is a prediction/annotation mismatch plus an emoji 💚 (I haven't tested with other emojis), the UI shows an error.
I was told clearing all annotations and then annotating and saving works sometimes!
On the server side this is caused by:
ValueError: IOB tags cannot handle overlapping spans!
Steps to reproduce
I'm using char tokens:
Environment (please complete the following information):
Additional context
If you are validating multiple records at once and one of them fails, the others fail too. And if the annotator doesn't notice the toast error message and moves on to the next page, their annotations on the previous page are lost.
Maybe we can show an alert() popup when there are unsaved annotations and the user is trying to navigate away?