@@ -214,7 +214,7 @@ def __init__(
214
214
allow_fallback : bool = True ,
215
215
disable_verify : bool = False ,
216
216
verify_hash : Optional [str ] = None ,
217
- normalize_input : bool = False ,
217
+ normalize_input : bool = True ,
218
218
):
219
219
self .cache_dir = cache_dir or CACHE_DIRECTORY
220
220
self .custom_model_path = custom_model_path
@@ -242,6 +242,35 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242
242
self .config = config or LangDetectConfig ()
243
243
self ._model_loader = ModelLoader ()
244
244
245
+ def _normalize_text (self , text : str , should_normalize : bool = False ) -> str :
246
+ """
247
+ Normalize text based on configuration.
248
+
249
+ Currently handles:
250
+ - Removing newline characters for better prediction
251
+ - Lowercasing uppercase text to prevent misdetection as Japanese
252
+
253
+ :param text: Input text
254
+ :param should_normalize: Whether normalization should be applied
255
+ :return: Normalized text
256
+ """
257
+ # If not normalization is needed, return the processed text
258
+ if not should_normalize :
259
+ return text
260
+
261
+ # Check and record newline and long text
262
+ if "\n " in text :
263
+ text = text .replace ("\n " , " " )
264
+
265
+ # Check if text is all uppercase or mostly uppercase
266
+ if text .isupper () or (
267
+ len (re .findall (r'[A-Z]' , text )) > 0.8 * len (re .findall (r'[A-Za-z]' , text ))
268
+ and len (text ) > 5
269
+ ):
270
+ return text .lower ()
271
+
272
+ return text
273
+
245
274
def _get_model (self , low_memory : bool = True ) -> Any :
246
275
"""Get or load appropriate model."""
247
276
cache_key = "low_memory" if low_memory else "high_memory"
@@ -290,7 +319,18 @@ def detect(
290
319
DetectError: If detection fails
291
320
"""
292
321
model = self ._get_model (low_memory )
293
- normalized_text = _normalize_text (text , self .config .normalize_input )
322
+ normalized_text = self ._normalize_text (text , self .config .normalize_input )
323
+ if len (normalized_text ) > 100 :
324
+ logger .warning (
325
+ "fast-langdetect: Text is too long. "
326
+ "Consider passing only a single sentence for accurate prediction."
327
+ )
328
+ if "\n " in normalized_text :
329
+ logger .warning (
330
+ "fast-langdetect: Text contains newline characters. "
331
+ "Removing newlines for better prediction accuracy."
332
+ )
333
+ normalized_text = normalized_text .replace ("\n " , " " )
294
334
try :
295
335
labels , scores = model .predict (normalized_text )
296
336
return {
@@ -322,7 +362,7 @@ def detect_multilingual(
322
362
DetectError: If detection fails
323
363
"""
324
364
model = self ._get_model (low_memory )
325
- normalized_text = _normalize_text (text , self .config .normalize_input )
365
+ normalized_text = self . _normalize_text (text , self .config .normalize_input )
326
366
try :
327
367
labels , scores = model .predict (normalized_text , k = k , threshold = threshold )
328
368
results = [
@@ -342,66 +382,52 @@ def detect_multilingual(
342
382
_default_detector = LangDetector ()
343
383
344
384
345
- def _normalize_text (text : str , should_normalize : bool = False ) -> str :
346
- """
347
- Normalize text based on configuration.
348
-
349
- Currently handles:
350
- - Lowercasing uppercase text to prevent misdetection as Japanese
351
-
352
- :param text: Input text
353
- :param should_normalize: Whether normalization should be applied
354
- :return: Normalized text
355
- """
356
- if not should_normalize :
357
- return text
358
-
359
- # Check if text is all uppercase (or mostly uppercase)
360
- if text .isupper () or (
361
- len (re .findall (r'[A-Z]' , text )) > 0.8 * len (re .findall (r'[A-Za-z]' , text ))
362
- and len (text ) > 5
363
- ):
364
- return text .lower ()
365
-
366
- return text
367
-
368
-
369
385
def detect(
    text: str,
    *,
    low_memory: bool = True,
    model_download_proxy: Optional[str] = None,
    use_strict_mode: bool = False,
    config: Optional[LangDetectConfig] = None,
) -> Dict[str, Union[str, float]]:
    """
    Simple interface for language detection.

    Too long or too short text will affect the accuracy of the prediction.

    Detector selection, in order of precedence:
    1. ``config`` given: a new ``LangDetector`` is built from it and the
       deprecated individual parameters are ignored.
    2. ``model_download_proxy`` or ``use_strict_mode`` given: a deprecation
       warning is logged and a ``LangDetectConfig`` is built from them.
    3. Otherwise the shared module-level default detector is used.

    :param text: Input text
    :param low_memory: Whether to use memory-efficient model
    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
    :param config: Optional LangDetectConfig object for advanced configuration

    :return: Dictionary with language and confidence score
    """
    # An explicit config always wins
    if config is not None:
        return LangDetector(config).detect(text, low_memory=low_memory)

    # No legacy parameters supplied: use the shared default detector
    if model_download_proxy is None and not use_strict_mode:
        return _default_detector.detect(text, low_memory=low_memory)

    # Legacy path: individual parameters are still honoured, with a warning
    logger.warning(
        "fast-langdetect: Using individual parameters is deprecated. "
        "Consider using LangDetectConfig for better configuration management. "
        "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
    )
    legacy_config = LangDetectConfig(
        proxy=model_download_proxy,
        allow_fallback=not use_strict_mode,
    )
    return LangDetector(legacy_config).detect(text, low_memory=low_memory)
406
432
407
433
@@ -413,40 +439,52 @@ def detect_multilingual(
413
439
k : int = 5 ,
414
440
threshold : float = 0.0 ,
415
441
use_strict_mode : bool = False ,
416
- normalize_input : bool = True ,
442
+ config : Optional [ LangDetectConfig ] = None ,
417
443
) -> List [Dict [str , Any ]]:
418
444
"""
419
445
Simple interface for multi-language detection.
420
446
421
- Before passing a text to this function, you remove all the newline characters.
422
-
423
447
Too long or too short text will effect the accuracy of the prediction.
424
448
425
449
:param text: Input text without newline characters
426
450
:param low_memory: Whether to use memory-efficient model
427
- :param model_download_proxy: Optional proxy for model download
428
451
:param k: Number of top languages to return
429
452
:param threshold: Minimum confidence threshold
430
- :param use_strict_mode: Disable fallback to small model
431
- :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
453
+ :param model_download_proxy: [DEPRECATED] Optional proxy for model download
454
+ :param use_strict_mode: [DEPRECATED] Disable fallback to small model
455
+ :param config: Optional LangDetectConfig object for advanced configuration
432
456
433
457
:return: List of dictionaries with languages and scores
434
458
"""
435
- if "\n " in text or len (text ) > 100 :
459
+ # Use provided config or create new config
460
+ if config is not None :
461
+ detector = LangDetector (config )
462
+ return detector .detect_multilingual (
463
+ text , low_memory = low_memory , k = k , threshold = threshold
464
+ )
465
+
466
+ # Check if any custom parameters are provided
467
+ has_custom_params = any ([
468
+ model_download_proxy is not None ,
469
+ use_strict_mode ,
470
+ ])
471
+ if has_custom_params :
472
+ # Show warning if using individual parameters
436
473
logger .warning (
437
- "fast-langdetect: Text contains newline characters or is too long. "
438
- "You should only pass a single sentence for accurate prediction."
474
+ "fast-langdetect: Using individual parameters is deprecated. "
475
+ "Consider using LangDetectConfig for better configuration management. "
476
+ "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
439
477
)
440
- if model_download_proxy or use_strict_mode or normalize_input :
441
- config = LangDetectConfig (
478
+ custom_config = LangDetectConfig (
442
479
proxy = model_download_proxy ,
443
480
allow_fallback = not use_strict_mode ,
444
- normalize_input = normalize_input
445
481
)
446
- detector = LangDetector (config )
482
+ detector = LangDetector (custom_config )
447
483
return detector .detect_multilingual (
448
484
text , low_memory = low_memory , k = k , threshold = threshold
449
485
)
486
+
487
+ # Use default detector
450
488
return _default_detector .detect_multilingual (
451
489
text , low_memory = low_memory , k = k , threshold = threshold
452
490
)
0 commit comments