Skip to content

Commit

Permalink
Overload of DetectFromBytes(byte[] bytes, int offset, int len) (#106)
Browse files Browse the repository at this point in the history
* Add an overload of CharsetDetector.DetectFromBytes

* Patch IsStartsWithBom to support offset parameter

* Remove System.Memory reference

* Patch CharsetDetector.FindInputState

* Patch CharsetDetector.FindInputState (2)

* Fix duplicated offset from the index base

* Doc/Msg improvement for DetectFromBytes overload

Added BOM offset info to the docs of DetectFromBytes(byte[], int, int)
Improve the exception message of DetectFromBytes(byte[], int, int)
  • Loading branch information
ied206 authored and 304NotModified committed Jan 26, 2020
1 parent d52af8d commit f1aa5fd
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 19 deletions.
69 changes: 51 additions & 18 deletions src/CharsetDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,9 @@ private CharsetDetector()

/// <summary>
/// Detect the character encoding from this byte array.
/// It searches for a BOM starting at bytes[0].
/// </summary>
/// <param name="bytes"></param>
/// <param name="bytes">The byte array containing the text</param>
/// <returns></returns>
public static DetectionResult DetectFromBytes(byte[] bytes)
{
Expand All @@ -133,6 +134,38 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
return detector.DataEnd();
}

/// <summary>
/// Detect the character encoding from a segment of this byte array.
/// It searches for a BOM starting at bytes[offset].
/// </summary>
/// <param name="bytes">The byte array containing the text</param>
/// <param name="offset">The zero-based byte offset in <paramref name="bytes"/> at which to begin reading the data</param>
/// <param name="len">The maximum number of bytes to be read</param>
/// <returns>The detection result obtained after feeding the requested segment to a new detector</returns>
/// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> or <paramref name="len"/> is negative</exception>
/// <exception cref="ArgumentException">
/// The segment described by <paramref name="offset"/> and <paramref name="len"/> extends past the end of <paramref name="bytes"/>
/// </exception>
public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
{
    if (bytes == null)
    {
        throw new ArgumentNullException(nameof(bytes));
    }
    if (offset < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(offset));
    }
    if (len < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(len));
    }
    // Compare via subtraction: "offset + len" can overflow Int32 and wrap to a
    // negative value, which would let an out-of-range segment slip through.
    // Both operands are non-negative here, so "bytes.Length - offset" cannot overflow.
    if (bytes.Length - offset < len)
    {
        throw new ArgumentException($"{nameof(len)} is greater than the number of bytes from {nameof(offset)} to the end of the array.");
    }

    var detector = new CharsetDetector();
    detector.Feed(bytes, offset, len);
    return detector.DataEnd();
}

#if !NETSTANDARD1_0

/// <summary>
Expand Down Expand Up @@ -270,12 +303,12 @@ protected virtual void Feed(byte[] buf, int offset, int len)
if (_start)
{
_start = false;
_done = IsStartsWithBom(buf, len);
_done = IsStartsWithBom(buf, offset, len);
if (_done)
return;
}

FindInputState(buf, len);
FindInputState(buf, offset, len);
foreach (var prober in CharsetProbers)
{
_done = RunProber(buf, offset, len, prober);
Expand All @@ -284,9 +317,9 @@ protected virtual void Feed(byte[] buf, int offset, int len)
}
}

private bool IsStartsWithBom(byte[] buf, int len)
private bool IsStartsWithBom(byte[] buf, int offset, int len)
{
var bomSet = FindCharSetByBom(buf, len);
var bomSet = FindCharSetByBom(buf, offset, len);
if (bomSet != null)
{
_detectionDetail = new DetectionDetail(bomSet, 1.0f);
Expand All @@ -306,9 +339,9 @@ private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetPro
return false;
}

private void FindInputState(byte[] buf, int len)
private void FindInputState(byte[] buf, int offset, int len)
{
for (int i = 0; i < len; i++)
for (int i = offset; i < len; i++)
{
// other than 0xa0, if every other character is ascii, the page is ascii
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
Expand Down Expand Up @@ -337,35 +370,35 @@ private void FindInputState(byte[] buf, int len)
}
}

private static string FindCharSetByBom(byte[] buf, int len)
private static string FindCharSetByBom(byte[] buf, int offset, int len)
{
if (len < 2)
return null;

var buf0 = buf[0];
var buf1 = buf[1];
var buf0 = buf[offset + 0];
var buf1 = buf[offset + 1];

if (buf0 == 0xFE && buf1 == 0xFF)
{
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
return len > 3
&& buf[2] == 0x00 && buf[3] == 0x00
&& buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
? CodepageName.X_ISO_10646_UCS_4_3412
: CodepageName.UTF16_BE;
}

if (buf0 == 0xFF && buf1 == 0xFE)
{
return len > 3
&& buf[2] == 0x00 && buf[3] == 0x00
&& buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
? CodepageName.UTF32_LE
: CodepageName.UTF16_LE;
}

if (len < 3)
return null;

if (buf0 == 0xEF && buf1 == 0xBB && buf[2] == 0xBF)
if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
return CodepageName.UTF8;

if (len < 4)
Expand All @@ -374,22 +407,22 @@ private static string FindCharSetByBom(byte[] buf, int len)
//Here, because anyway further more than 3 positions are checked.
if (buf0 == 0x00 && buf1 == 0x00)
{
if (buf[2] == 0xFE && buf[3] == 0xFF)
if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF)
return CodepageName.UTF32_BE;

// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
if (buf[2] == 0xFF && buf[3] == 0xFE)
if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE)
return CodepageName.X_ISO_10646_UCS_4_2143;
}

// Detect utf-7 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
if (buf0 == 0x2B && buf1 == 0x2F && buf[2] == 0x76)
if (buf[3] == 0x38 || buf[3] == 0x39 || buf[3] == 0x2B || buf[3] == 0x2F)
if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
return CodepageName.UTF7;

// Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
// TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug?
if (buf0 == 0x84 && buf1 == 0x31 && buf[2] == 0x95 && buf[3] == 0x33)
if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
return CodepageName.GB18030;

return null;
Expand Down
21 changes: 20 additions & 1 deletion tests/CharsetDetectorTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,26 @@ public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int st
// Assert
Assert.AreEqual(expectedPosition, stream.Position);
}


[Test]
[TestCase(0, 10, CodepageName.ASCII)]
[TestCase(0, 100, CodepageName.UTF8)]
[TestCase(10, 100, CodepageName.UTF8)]
public void DetectFromByteArray(int offset, int len, string detectedCodepage)
{
    // Arrange: UTF-8 encode a text that begins with plain ASCII characters
    // and continues with Korean and CJK characters.
    const string text =
        "UTF-Unknown은 파일, 스트림, 그 외 바이트 배열의 캐릭터 셋을 탐지하는 라이브러리입니다." +
        "대한민국 (大韓民國, Republic of Korea)";
    byte[] encoded = Encoding.UTF8.GetBytes(text);

    // Act: run detection on the requested segment only.
    var detected = CharsetDetector.DetectFromBytes(encoded, offset, len).Detected;

    // Assert: the expected codepage is reported with full confidence.
    Assert.AreEqual(detectedCodepage, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}

[Test]
[TestCase(new byte[] { 0x2B, 0x2F, 0x76, 0x38 })]
[TestCase(new byte[] { 0x2B, 0x2F, 0x76, 0x39 })]
Expand Down

0 comments on commit f1aa5fd

Please sign in to comment.