Overload of DetectFromBytes(byte[] bytes, int offset, int len) (#106)

* Add an overload of CharsetDetector.DetectFromBytes * Patch IsStartsWithBom to support offset parameter * Remove System.Memory reference * Patch CharsetDetector.FindInputState * Patch CharsetDetector.FindInputState (2) * Fix duplicated offset from the index base * Doc/Msg improvement for DetectFromBytes overload: added BOM offset info to the docs of DetectFromBytes(byte[], int, int); improved exception message of DetectFromBytes(byte[], int, int)
CharsetDetector · Jan 26, 2020 · f1aa5fd · f1aa5fd
1 parent d52af8d
commit f1aa5fd
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 19 deletions.
diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs
@@ -118,8 +118,9 @@ private CharsetDetector()
 
         /// <summary>
         /// Detect the character encoding form this byte array.
+        /// It searchs for BOM from bytes[0].
         /// </summary>
-        /// <param name="bytes"></param>
+        /// <param name="bytes">The byte array containing the text</param>
         /// <returns></returns>
         public static DetectionResult DetectFromBytes(byte[] bytes)
         {
@@ -133,6 +134,38 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
             return detector.DataEnd();
         }
 
+        /// <summary>
+        /// Detect the character encoding from a segment of this byte array.
+        /// It searches for a BOM starting at bytes[offset].
+        /// </summary>
+        /// <param name="bytes">The byte array containing the text</param>
+        /// <param name="offset">The zero-based byte offset in <paramref name="bytes"/> at which to begin reading the data from</param>
+        /// <param name="len">The maximum number of bytes to be read</param>
+        /// <returns>The result reported by the detector after it has been fed the requested segment</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null</exception>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="offset"/> or <paramref name="len"/> is negative</exception>
+        /// <exception cref="ArgumentException"><paramref name="len"/> is greater than the number of bytes from <paramref name="offset"/> to the end of the array</exception>
+        public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
+        {
+            if (bytes == null)
+            {
+                throw new ArgumentNullException(nameof(bytes));
+            }
+            if (offset < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(offset));
+            }
+            if (len < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(len));
+            }
+            // Compare by subtraction: "offset + len" can overflow int and wrap to a negative
+            // value, which would let an out-of-range segment slip past this check.
+            // Both operands are known to be non-negative here, so the subtraction cannot overflow.
+            if (bytes.Length - offset < len)
+            {
+                throw new ArgumentException($"{nameof(len)} is greater than the number of bytes from {nameof(offset)} to the end of the array.");
+            }
+
+            var detector = new CharsetDetector();
+            detector.Feed(bytes, offset, len);
+            return detector.DataEnd();
+        }
+
 #if !NETSTANDARD1_0
 
         /// <summary>
@@ -270,12 +303,12 @@ protected virtual void Feed(byte[] buf, int offset, int len)
             if (_start)
             {
                 _start = false;
-                _done = IsStartsWithBom(buf, len);
+                _done = IsStartsWithBom(buf, offset, len);
                 if (_done)
                     return;
             }
 
-            FindInputState(buf, len);
+            FindInputState(buf, offset, len);
             foreach (var prober in CharsetProbers)
             {
                 _done = RunProber(buf, offset, len, prober);
@@ -284,9 +317,9 @@ protected virtual void Feed(byte[] buf, int offset, int len)
             }
         }
 
-        private bool IsStartsWithBom(byte[] buf, int len)
+        private bool IsStartsWithBom(byte[] buf, int offset, int len)
         {
-            var bomSet = FindCharSetByBom(buf, len);
+            var bomSet = FindCharSetByBom(buf, offset, len);
             if (bomSet != null)
             {
                 _detectionDetail = new DetectionDetail(bomSet, 1.0f);
@@ -306,9 +339,9 @@ private bool RunProber(byte[] buf, int offset, int len, CharsetProber charsetPro
             return false;
         }
 
-        private void FindInputState(byte[] buf, int len)
+        private void FindInputState(byte[] buf, int offset, int len)
         {
-            for (int i = 0; i < len; i++)
+            for (int i = offset; i < len; i++)
             {
                 // other than 0xa0, if every other character is ascii, the page is ascii
                 if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
@@ -337,35 +370,35 @@ private void FindInputState(byte[] buf, int len)
             }
         }
 
-        private static string FindCharSetByBom(byte[] buf, int len)
+        private static string FindCharSetByBom(byte[] buf, int offset, int len)
         {
             if (len < 2)
                 return null;
 
-            var buf0 = buf[0];
-            var buf1 = buf[1];
+            var buf0 = buf[offset + 0];
+            var buf1 = buf[offset + 1];
 
             if (buf0 == 0xFE && buf1 == 0xFF)
             {
                 // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                 return len > 3
-                        && buf[2] == 0x00 && buf[3] == 0x00
+                        && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
                     ? CodepageName.X_ISO_10646_UCS_4_3412
                     : CodepageName.UTF16_BE;
             }
 
             if (buf0 == 0xFF && buf1 == 0xFE)
             {
                 return len > 3
-                       && buf[2] == 0x00 && buf[3] == 0x00
+                       && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00
                     ? CodepageName.UTF32_LE
                     : CodepageName.UTF16_LE;
             }
 
             if (len < 3)
                 return null;
 
-            if (buf0 == 0xEF && buf1 == 0xBB && buf[2] == 0xBF)
+            if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
                 return CodepageName.UTF8;
 
             if (len < 4)
@@ -374,22 +407,22 @@ private static string FindCharSetByBom(byte[] buf, int len)
             //Here, because anyway further more than 3 positions are checked.
             if (buf0 == 0x00 && buf1 == 0x00)
             {
-                if (buf[2] == 0xFE && buf[3] == 0xFF)
+                if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF)
                     return CodepageName.UTF32_BE;
 
                 // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
-                if (buf[2] == 0xFF && buf[3] == 0xFE)
+                if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE)
                     return CodepageName.X_ISO_10646_UCS_4_2143;
             }
 
             // Detect utf-7 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
-            if (buf0 == 0x2B && buf1 == 0x2F && buf[2] == 0x76)
-                if (buf[3] == 0x38 || buf[3] == 0x39 || buf[3] == 0x2B || buf[3] == 0x2F)
+            if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
+                if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
                     return CodepageName.UTF7;
 
             // Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
             // TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug?
-            if (buf0 == 0x84 && buf1 == 0x31 && buf[2] == 0x95 && buf[3] == 0x33)
+            if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
                 return CodepageName.GB18030;
 
             return null;

diff --git a/tests/CharsetDetectorTest.cs b/tests/CharsetDetectorTest.cs
@@ -59,7 +59,26 @@ public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int st
             // Assert
             Assert.AreEqual(expectedPosition, stream.Position);
         }
-
+
+        [Test]
+        [TestCase(0, 10, CodepageName.ASCII)]
+        [TestCase(0, 100, CodepageName.UTF8)]
+        [TestCase(10, 100, CodepageName.UTF8)]
+        public void DetectFromByteArray(int offset, int len, string detectedCodepage)
+        {
+            // Arrange: the first 11 bytes ("UTF-Unknown") are plain ASCII, multi-byte UTF-8 follows.
+            const string text = "UTF-Unknown은 파일, 스트림, 그 외 바이트 배열의 캐릭터 셋을 탐지하는 라이브러리입니다." +
+                "대한민국 (大韓民國, Republic of Korea)";
+            byte[] encoded = Encoding.UTF8.GetBytes(text);
+
+            // Act: run detection on the requested segment and take the top candidate.
+            var best = CharsetDetector.DetectFromBytes(encoded, offset, len).Detected;
+
+            // Assert
+            Assert.AreEqual(detectedCodepage, best.EncodingName);
+            Assert.AreEqual(1.0f, best.Confidence);
+        }
+
         [Test]
         [TestCase(new byte[] { 0x2B, 0x2F, 0x76, 0x38 })]
         [TestCase(new byte[] { 0x2B, 0x2F, 0x76, 0x39 })]