Skip to content

Commit

Permalink
🔧 improve the detector general reliability (#532)
Browse files Browse the repository at this point in the history
Issues (#520) (#509) (#498) (#407)
  • Loading branch information
Ousret authored Sep 25, 2024
1 parent 0d694f0 commit 39b6f5c
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 14 deletions.
6 changes: 2 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-03-??)
## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??)

### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.

### Changed
- Optional mypyc compilation upgraded to version 1.9.0 for Python >= 3.8
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407)

## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)

Expand Down
16 changes: 10 additions & 6 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,16 +221,20 @@ def from_bytes(
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
Expand Down
2 changes: 2 additions & 0 deletions charset_normalizer/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@
"|",
'"',
"-",
"(",
")",
}


Expand Down
19 changes: 16 additions & 3 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
if self._character_count <= 24:
if self._character_count <= 13:
return 0.0

ratio_of_suspicious_range_usage: float = (
Expand All @@ -260,6 +260,7 @@ def __init__(self) -> None:

self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0

def eligible(self, character: str) -> bool:
return True
Expand All @@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
Expand All @@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
self._character_count += buffer_length

if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Words/buffers ending with an upper case accentuated letter are so rare
# that we consider them all suspicious. Same weight as foreign_long suspicious.
if (
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
Expand All @@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "3.3.2"
__version__ = "3.3.3"
VERSION = __version__.split(".")

0 comments on commit 39b6f5c

Please sign in to comment.