Skip to content

Commit

Permalink
[text analytics] add domain_filter param (Azure#13451)
Browse files Browse the repository at this point in the history
  • Loading branch information
iscai-msft authored and rakshith91 committed Sep 4, 2020
1 parent aacbd60 commit fc02f42
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
OpinionSentiment,
RecognizePiiEntitiesResult,
PiiEntity,
PiiEntityDomainType,
)

__all__ = [
Expand Down Expand Up @@ -59,6 +60,7 @@
'OpinionSentiment',
'RecognizePiiEntitiesResult',
'PiiEntity',
'PiiEntityDomainType',
]

__version__ = VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Licensed under the MIT License.
# ------------------------------------
import re
from enum import Enum
from ._generated.models import (
LanguageInput,
MultiLanguageInput,
Expand Down Expand Up @@ -64,6 +65,10 @@ def get(self, key, default=None):
return self.__dict__[key]
return default

class PiiEntityDomainType(str, Enum):
    """The different domains of PII entities that users can filter by.

    Members also subclass ``str``, so a member compares equal to its raw
    string value (for example ``"PHI"``).
    """

    # Protected Health Information domain.
    PROTECTED_HEALTH_INFORMATION = "PHI" # See https://aka.ms/tanerpii for more information.


class DetectedLanguage(DictMixin):
"""DetectedLanguage contains the predicted language found in text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to only those included in the specified domain.
I.e., if set to 'PHI', will only return entities in the Protected Health Information domain.
See https://aka.ms/tanerpii for more information.
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -281,13 +285,15 @@ def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)
if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ async def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to only those included in the specified domain.
I.e., if set to 'PHI', will only return entities in the Protected Health Information domain.
See https://aka.ms/tanerpii for more information.
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -283,13 +287,16 @@ async def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)

if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return await self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- c2319b95-6fd2-46c9-80e3-06c8f2701825
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 20:32:54 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '79'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: 9265752d-3262-4dbb-94d6-be26889e3db9
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 20:32:55 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '82'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -573,4 +574,17 @@ def test_recognize_pii_entities_v3(self, client):
with pytest.raises(NotImplementedError) as excinfo:
client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)
assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_phi_domain_filter(self, client):
    """Check that the PHI domain filter narrows the PII results to the phone number."""
    # Unfiltered, this text is expected to yield two entities (Microsoft as an
    # organization, plus the phone number); with the PHI filter only the phone
    # number should remain.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PROTECTED_HEALTH_INFORMATION
    )

    entities = response[0].entities
    self.assertEqual(len(entities), 1)
    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -572,3 +573,16 @@ async def test_recognize_pii_entities_v3(self, client):
await client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_phi_domain_filter(self, client):
    """Check that the PHI domain filter narrows the PII results to the phone number."""
    # Unfiltered, this text is expected to yield two entities (Microsoft as an
    # organization, plus the phone number); with the PHI filter only the phone
    # number should remain.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = await client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PROTECTED_HEALTH_INFORMATION
    )

    entities = response[0].entities
    self.assertEqual(len(entities), 1)
    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')

0 comments on commit fc02f42

Please sign in to comment.