Skip to content

Commit

Permalink
Add custom detection function for language detection
Browse files Browse the repository at this point in the history
  • Loading branch information
saucam committed Feb 25, 2023
1 parent 472ef54 commit eb7f5ed
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ __pycache__/
.coverage
.coverage.*
*.pyc
.vscode/
6 changes: 5 additions & 1 deletion src/python/txtai/pipeline/text/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,21 @@ def modelids(self):
ids = [x.modelId for x in HfApi().list_models(author="Helsinki-NLP")] if self.findmodels else []
return set(ids)

def detect(self, texts):
def detect(self, texts, custom_detect=None):
"""
Detects the language for each element in texts.
Args:
texts: list of text
custom_detect: A function to use custom model to detect language
Returns:
list of languages
"""

if custom_detect is not None:
return custom_detect(texts)

if not FASTTEXT:
raise ImportError('Language detection is not available - install "pipeline" extra to enable')

Expand Down
25 changes: 25 additions & 0 deletions test/python/testpipeline/testtranslation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,31 @@ class TestTranslation(unittest.TestCase):
Translation tests.
"""

def testDetect(self):
    """
    Test that default language detection labels an English sentence as "en"
    """

    texts = ["This is a test language detection."]
    detected = Translation().detect(texts)

    self.assertListEqual(detected, ["en"])

def testDetectWithCustomFunc(self):
    """
    Test language detection when a custom detection function is supplied
    """

    def always_english(texts):
        # Stub detector: one "en" label per input element
        return ["en"] * len(texts)

    pipeline = Translation()
    sample = ["This is a test language detection."]

    detected = pipeline.detect(sample, always_english)

    self.assertListEqual(detected, ["en"])

def testLongTranslation(self):
"""
Test a translation longer than max tokenization length
Expand Down

0 comments on commit eb7f5ed

Please sign in to comment.