Skip to content

Commit 797ae02

Browse files
committed
✨ feat(app): improve text normalization in language detection
Enhanced text normalization by removing newline characters and lowercasing uppercase text to improve prediction accuracy. Added warnings for deprecated parameters and improved configuration management using LangDetectConfig. These changes enhance text preprocessing and ensure better configuration management.
1 parent b43370f commit 797ae02

File tree

5 files changed

+112
-70
lines changed

5 files changed

+112
-70
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ In scenarios **where accuracy is important**, you should not rely on the detecti
4343

4444
### Prerequisites
4545

46-
- The "\n" character in the argument string must be removed before calling the function.
4746
- If the sample is too long or too short, the accuracy will be reduced.
4847
- The model will be downloaded to system temporary directory by default. You can customize it by:
4948
- Setting `FTLANG_CACHE` environment variable
@@ -79,7 +78,6 @@ except DetectError as e:
7978
multiline_text = """
8079
Hello, world!
8180
This is a multiline text.
82-
But we need remove \n characters or it will raise a DetectError.
8381
"""
8482
multiline_text = multiline_text.replace("\n", " ")
8583
print(detect(multiline_text))

feature_test/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
# When offline, it raises an error
3131
print(
3232
detect_multilingual(
33-
"Hello, world!你好世界!Привет, мир!", low_memory=False, use_strict_mode=True
33+
"Hello, world!你好世界!Привет, мир!",
34+
low_memory=False,
35+
config=LangDetectConfig(allow_fallback=True)
3436
)
3537
)
3638

src/fast_langdetect/infer.py

Lines changed: 92 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ def __init__(
214214
allow_fallback: bool = True,
215215
disable_verify: bool = False,
216216
verify_hash: Optional[str] = None,
217-
normalize_input: bool = False,
217+
normalize_input: bool = True,
218218
):
219219
self.cache_dir = cache_dir or CACHE_DIRECTORY
220220
self.custom_model_path = custom_model_path
@@ -242,6 +242,35 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242242
self.config = config or LangDetectConfig()
243243
self._model_loader = ModelLoader()
244244

245+
def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
246+
"""
247+
Normalize text based on configuration.
248+
249+
Currently handles:
250+
- Removing newline characters for better prediction
251+
- Lowercasing uppercase text to prevent misdetection as Japanese
252+
253+
:param text: Input text
254+
:param should_normalize: Whether normalization should be applied
255+
:return: Normalized text
256+
"""
257+
# If no normalization is needed, return the text unchanged
258+
if not should_normalize:
259+
return text
260+
261+
# Check and record newline and long text
262+
if "\n" in text:
263+
text = text.replace("\n", " ")
264+
265+
# Check if text is all uppercase or mostly uppercase
266+
if text.isupper() or (
267+
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
268+
and len(text) > 5
269+
):
270+
return text.lower()
271+
272+
return text
273+
245274
def _get_model(self, low_memory: bool = True) -> Any:
246275
"""Get or load appropriate model."""
247276
cache_key = "low_memory" if low_memory else "high_memory"
@@ -290,7 +319,18 @@ def detect(
290319
DetectError: If detection fails
291320
"""
292321
model = self._get_model(low_memory)
293-
normalized_text = _normalize_text(text, self.config.normalize_input)
322+
normalized_text = self._normalize_text(text, self.config.normalize_input)
323+
if len(normalized_text) > 100:
324+
logger.warning(
325+
"fast-langdetect: Text is too long. "
326+
"Consider passing only a single sentence for accurate prediction."
327+
)
328+
if "\n" in normalized_text:
329+
logger.warning(
330+
"fast-langdetect: Text contains newline characters. "
331+
"Removing newlines for better prediction accuracy."
332+
)
333+
normalized_text = normalized_text.replace("\n", " ")
294334
try:
295335
labels, scores = model.predict(normalized_text)
296336
return {
@@ -322,7 +362,7 @@ def detect_multilingual(
322362
DetectError: If detection fails
323363
"""
324364
model = self._get_model(low_memory)
325-
normalized_text = _normalize_text(text, self.config.normalize_input)
365+
normalized_text = self._normalize_text(text, self.config.normalize_input)
326366
try:
327367
labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
328368
results = [
@@ -342,66 +382,52 @@ def detect_multilingual(
342382
_default_detector = LangDetector()
343383

344384

345-
def _normalize_text(text: str, should_normalize: bool = False) -> str:
346-
"""
347-
Normalize text based on configuration.
348-
349-
Currently handles:
350-
- Lowercasing uppercase text to prevent misdetection as Japanese
351-
352-
:param text: Input text
353-
:param should_normalize: Whether normalization should be applied
354-
:return: Normalized text
355-
"""
356-
if not should_normalize:
357-
return text
358-
359-
# Check if text is all uppercase (or mostly uppercase)
360-
if text.isupper() or (
361-
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
362-
and len(text) > 5
363-
):
364-
return text.lower()
365-
366-
return text
367-
368-
369385
def detect(
370386
text: str,
371387
*,
372388
low_memory: bool = True,
373389
model_download_proxy: Optional[str] = None,
374390
use_strict_mode: bool = False,
375-
normalize_input: bool = True,
391+
config: Optional[LangDetectConfig] = None,
376392
) -> Dict[str, Union[str, float]]:
377393
"""
378394
Simple interface for language detection.
379-
380-
Before passing a text to this function, you remove all the newline characters.
381-
395+
382396
Too long or too short text will affect the accuracy of the prediction.
383397
384398
:param text: Input text without newline characters
385399
:param low_memory: Whether to use memory-efficient model
386-
:param model_download_proxy: Optional proxy for model download
387-
:param use_strict_mode: Disable fallback to small model
388-
:param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
400+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
401+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
402+
:param config: Optional LangDetectConfig object for advanced configuration
389403
390404
:return: Dictionary with language and confidence score
391405
"""
392-
if "\n" in text or len(text) > 1000:
406+
# Provide config
407+
if config is not None:
408+
detector = LangDetector(config)
409+
return detector.detect(text, low_memory=low_memory)
410+
411+
# Check if any custom parameters are provided
412+
has_custom_params = any([
413+
model_download_proxy is not None,
414+
use_strict_mode,
415+
])
416+
if has_custom_params:
417+
# Show warning if using individual parameters
393418
logger.warning(
394-
"fast-langdetect: Text contains newline characters or is too long. "
395-
"You should only pass a single sentence for accurate prediction."
419+
"fast-langdetect: Using individual parameters is deprecated. "
420+
"Consider using LangDetectConfig for better configuration management. "
421+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
396422
)
397-
if model_download_proxy or use_strict_mode or normalize_input:
398-
config = LangDetectConfig(
423+
custom_config = LangDetectConfig(
399424
proxy=model_download_proxy,
400425
allow_fallback=not use_strict_mode,
401-
normalize_input=normalize_input
402426
)
403-
detector = LangDetector(config)
427+
detector = LangDetector(custom_config)
404428
return detector.detect(text, low_memory=low_memory)
429+
430+
# Use default detector
405431
return _default_detector.detect(text, low_memory=low_memory)
406432

407433

@@ -413,40 +439,52 @@ def detect_multilingual(
413439
k: int = 5,
414440
threshold: float = 0.0,
415441
use_strict_mode: bool = False,
416-
normalize_input: bool = True,
442+
config: Optional[LangDetectConfig] = None,
417443
) -> List[Dict[str, Any]]:
418444
"""
419445
Simple interface for multi-language detection.
420446
421-
Before passing a text to this function, you remove all the newline characters.
422-
423447
Too long or too short text will affect the accuracy of the prediction.
424448
425449
:param text: Input text without newline characters
426450
:param low_memory: Whether to use memory-efficient model
427-
:param model_download_proxy: Optional proxy for model download
428451
:param k: Number of top languages to return
429452
:param threshold: Minimum confidence threshold
430-
:param use_strict_mode: Disable fallback to small model
431-
:param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
453+
:param model_download_proxy: [DEPRECATED] Optional proxy for model download
454+
:param use_strict_mode: [DEPRECATED] Disable fallback to small model
455+
:param config: Optional LangDetectConfig object for advanced configuration
432456
433457
:return: List of dictionaries with languages and scores
434458
"""
435-
if "\n" in text or len(text) > 100:
459+
# Use provided config or create new config
460+
if config is not None:
461+
detector = LangDetector(config)
462+
return detector.detect_multilingual(
463+
text, low_memory=low_memory, k=k, threshold=threshold
464+
)
465+
466+
# Check if any custom parameters are provided
467+
has_custom_params = any([
468+
model_download_proxy is not None,
469+
use_strict_mode,
470+
])
471+
if has_custom_params:
472+
# Show warning if using individual parameters
436473
logger.warning(
437-
"fast-langdetect: Text contains newline characters or is too long. "
438-
"You should only pass a single sentence for accurate prediction."
474+
"fast-langdetect: Using individual parameters is deprecated. "
475+
"Consider using LangDetectConfig for better configuration management. "
476+
"Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
439477
)
440-
if model_download_proxy or use_strict_mode or normalize_input:
441-
config = LangDetectConfig(
478+
custom_config = LangDetectConfig(
442479
proxy=model_download_proxy,
443480
allow_fallback=not use_strict_mode,
444-
normalize_input=normalize_input
445481
)
446-
detector = LangDetector(config)
482+
detector = LangDetector(custom_config)
447483
return detector.detect_multilingual(
448484
text, low_memory=low_memory, k=k, threshold=threshold
449485
)
486+
487+
# Use default detector
450488
return _default_detector.detect_multilingual(
451489
text, low_memory=low_memory, k=k, threshold=threshold
452490
)

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ def pytest_configure(config):
44
"""注册自定义标记。"""
55
config.addinivalue_line(
66
"markers",
7-
"slow: 标记需要较长时间运行的测试"
7+
"slow: Run in long progress"
88
)
99
config.addinivalue_line(
1010
"markers",
11-
"real: 标记使用真实模型的测试"
11+
"real: Test with real model"
1212
)

tests/test_detect.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,31 @@
22
# @Time : 2024/1/17 下午5:28
33

44
def test_muti_detect():
5-
from fast_langdetect import detect_multilingual
6-
result = detect_multilingual("hello world", low_memory=True, use_strict_mode=True)
5+
from fast_langdetect import detect_multilingual,LangDetectConfig
6+
result = detect_multilingual(
7+
"hello world",
8+
low_memory=True,
9+
config=LangDetectConfig(allow_fallback=False)
10+
)
711
assert result[0].get("lang") == "en", "ft_detect error"
812
return True
913

1014

1115
def test_large():
12-
from fast_langdetect import detect_multilingual
13-
result = detect_multilingual("hello world", low_memory=True, use_strict_mode=True)
16+
from fast_langdetect import detect_multilingual, LangDetectConfig
17+
result = detect_multilingual("hello world", low_memory=True, config=LangDetectConfig(allow_fallback=False))
1418
assert result[0].get("lang") == "en", "ft_detect error"
15-
result = detect_multilingual("你好世界", low_memory=False, use_strict_mode=True)
19+
result = detect_multilingual("你好世界", low_memory=False, config=LangDetectConfig(allow_fallback=False))
1620
assert result[0].get("lang") == "zh", "ft_detect error"
1721

1822

1923
def test_detect():
20-
from fast_langdetect import detect
21-
assert detect("hello world", low_memory=False, use_strict_mode=True)["lang"] == "en", "ft_detect error"
22-
assert detect("你好世界", low_memory=True, use_strict_mode=True)["lang"] == "zh", "ft_detect error"
23-
assert detect("こんにちは世界", low_memory=False, use_strict_mode=True)["lang"] == "ja", "ft_detect error"
24-
assert detect("안녕하세요 세계", low_memory=True, use_strict_mode=True)["lang"] == "ko", "ft_detect error"
25-
assert detect("Bonjour le monde", low_memory=False, use_strict_mode=True)["lang"] == "fr", "ft_detect error"
24+
from fast_langdetect import detect, LangDetectConfig
25+
assert detect("hello world", low_memory=False, config=LangDetectConfig(allow_fallback=False))["lang"] == "en", "ft_detect error"
26+
assert detect("你好世界", low_memory=True, config=LangDetectConfig(allow_fallback=False))["lang"] == "zh", "ft_detect error"
27+
assert detect("こんにちは世界", low_memory=False, config=LangDetectConfig(allow_fallback=False))["lang"] == "ja", "ft_detect error"
28+
assert detect("안녕하세요 세계", low_memory=True, config=LangDetectConfig(allow_fallback=False))["lang"] == "ko", "ft_detect error"
29+
assert detect("Bonjour le monde", low_memory=False, config=LangDetectConfig(allow_fallback=False))["lang"] == "fr", "ft_detect error"
2630

2731

2832
def test_detect_totally():

0 commit comments

Comments
 (0)