From 961d3e11a9e56d88f7b18f5c2b8f9e07b75de4ac Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Mon, 6 Jan 2025 12:23:38 -0700 Subject: [PATCH 01/13] Use DecoderFallback.ExceptionFallback to match Java behavior, #1076 --- .gitignore | 3 +- .../Synonym/FSTSynonymFilterFactory.cs | 5 +- .../Analysis/Util/AbstractAnalysisFactory.cs | 4 +- .../JapaneseTokenizerFactory.cs | 3 +- .../Tools/ConnectionCostsBuilder.cs | 4 +- .../Tools/TokenInfoDictionaryBuilder.cs | 3 +- .../Tools/UnknownDictionaryBuilder.cs | 3 +- .../Util/LineFileDocs.cs | 6 ++- src/Lucene.Net/Index/Term.cs | 6 +-- .../Support/Text/EncodingExtensions.cs | 51 +++++++++++++++++++ src/Lucene.Net/Util/IOUtils.cs | 4 +- 11 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 src/Lucene.Net/Support/Text/EncodingExtensions.cs diff --git a/.gitignore b/.gitignore index 7446dd01a6..6f8520af1c 100644 --- a/.gitignore +++ b/.gitignore @@ -65,4 +65,5 @@ websites/apidocs/api/**/*.manifest svn-*/ # vscode files -.vscode/ \ No newline at end of file +.vscode/ +.idea/**/misc.xml diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs index 41303cb1b9..c862dce167 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/FSTSynonymFilterFactory.cs @@ -1,6 +1,7 @@ // Lucene version compatibility level 4.8.1 using Lucene.Net.Analysis.Core; using Lucene.Net.Analysis.Util; +using Lucene.Net.Support.Text; using Lucene.Net.Util; using System; using System.Collections.Generic; @@ -117,7 +118,7 @@ public void Inform(IResourceLoader loader) /// private SynonymMap LoadSynonyms(IResourceLoader loader, string cname, bool dedup, Analyzer analyzer) { - Encoding decoder = Encoding.UTF8; + Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback(); SynonymMap.Parser parser; Type clazz = loader.FindType(cname /*, typeof(SynonymMap.Parser) */); @@ -165,4 +166,4 @@ private TokenizerFactory LoadTokenizerFactory(IResourceLoader loader, string cna } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs index 9839b027d4..874c020fd0 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/AbstractAnalysisFactory.cs @@ -1,6 +1,7 @@ // Lucene version compatibility level 4.8.1 using Lucene.Net.Analysis.Core; using Lucene.Net.Support; +using Lucene.Net.Support.Text; using Lucene.Net.Util; using System; using System.Collections.Generic; @@ -385,8 +386,9 @@ protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFil words = new CharArraySet(m_luceneMatchVersion, files.Count * 10, ignoreCase); foreach (string file in files) { + Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback(); using (Stream stream = loader.OpenResource(file.Trim())) - using (TextReader reader = new StreamReader(stream, Encoding.UTF8)) + using (TextReader reader = new StreamReader(stream, decoder)) { WordlistLoader.GetSnowballWordSet(reader, words); } diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs index 5595375e4a..6e6d402d1a 100644 --- a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs +++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs @@ -1,5 +1,6 @@ using Lucene.Net.Analysis.Ja.Dict; using Lucene.Net.Analysis.Util; +using Lucene.Net.Support.Text; using Lucene.Net.Util; using System; using System.Collections.Generic; @@ -88,7 +89,7 @@ public virtual void Inform(IResourceLoader loader) { encoding = Encoding.UTF8.WebName; } - Encoding decoder = Encoding.GetEncoding(encoding); + Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback(); TextReader reader = new StreamReader(stream, decoder); userDictionary = new UserDictionary(reader); } diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs index 5d5f1d4c7d..ef8b482f15 100644 --- a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs @@ -1,5 +1,6 @@ using J2N.Text; using Lucene.Net.Diagnostics; +using Lucene.Net.Support.Text; using System.Globalization; using System.IO; using System.Text; @@ -31,7 +32,8 @@ public static class ConnectionCostsBuilder // LUCENENET specific: CA1052 Static public static ConnectionCostsWriter Build(string filename) { using Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read); - using StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement + Encoding decoder = Encoding.ASCII.WithDecoderExceptionFallback(); + using StreamReader streamReader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement string line = streamReader.ReadLine(); string[] dimensions = whiteSpaceRegex.Split(line).TrimEnd(); diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs index 7371829619..7e0df4ad7f 100644 --- a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs @@ -1,5 +1,6 @@ using J2N.Text; using Lucene.Net.Support; +using Lucene.Net.Support.Text; using Lucene.Net.Util; using Lucene.Net.Util.Fst; using Lucene.Net.Util.Packed; @@ -71,7 +72,7 @@ public virtual TokenInfoDictionaryWriter BuildDictionary(IList csvFiles) foreach (string file in csvFiles) { using Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read); - Encoding decoder = Encoding.GetEncoding(encoding); + Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback(); using TextReader reader = new StreamReader(inputStream, decoder, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true); // LUCENENET: CA2000: Use using statement string line = null; diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs index 3fde184c0c..b2023e4625 100644 --- a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs @@ -1,5 +1,6 @@ using J2N.Text; using Lucene.Net.Analysis.Ja.Dict; +using Lucene.Net.Support.Text; using System; using System.Collections.Generic; using System.Globalization; @@ -55,7 +56,7 @@ public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, strin UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); JCG.List lines = new JCG.List(); - Encoding decoder = Encoding.GetEncoding(encoding); + Encoding decoder = Encoding.GetEncoding(encoding).WithDecoderExceptionFallback(); using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read)) using (TextReader reader = new StreamReader(inputStream, decoder)) { diff --git a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs index 991cdb6289..168d0e7467 100644 --- a/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs +++ b/src/Lucene.Net.TestFramework/Util/LineFileDocs.cs @@ -2,6 +2,7 @@ using J2N.Threading.Atomic; using Lucene.Net.Documents; using Lucene.Net.Support.IO; +using Lucene.Net.Support.Text; using Lucene.Net.Support.Threading; using RandomizedTesting.Generators; using System; @@ -236,7 +237,8 @@ private void Open(Random random) } while (b >= 0 && b != 13 && b != 10); } - reader = new StreamReader(@is, Encoding.UTF8, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE); + Encoding decoder = Encoding.UTF8.WithDecoderExceptionFallback(); + reader = new StreamReader(@is, decoder, detectEncodingFromByteOrderMarks: false, bufferSize: BUFFER_SIZE); if (seekTo > 0L) { @@ -399,4 +401,4 @@ internal static string MaybeCreateTempFile(bool removeAfterClass = true) return result; } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs index 38eda37a33..2de523a20a 100644 --- a/src/Lucene.Net/Index/Term.cs +++ b/src/Lucene.Net/Index/Term.cs @@ -1,5 +1,6 @@ using J2N.Text; using Lucene.Net.Support; +using Lucene.Net.Support.Text; using System; using System.Text; @@ -91,13 +92,12 @@ public Term(string fld) public static string ToString(BytesRef termText) { // the term might not be text, but usually is. so we make a best effort - // LUCENENET TODO: determine if we should use DecoderFallback.ExceptionFallback here - Encoding decoder = StandardCharsets.UTF_8; + Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback(); try { return decoder.GetString(termText.Bytes, termText.Offset, termText.Length); } - catch + catch (DecoderFallbackException) { return termText.ToString(); } diff --git a/src/Lucene.Net/Support/Text/EncodingExtensions.cs b/src/Lucene.Net/Support/Text/EncodingExtensions.cs new file mode 100644 index 0000000000..4e3cd5af4b --- /dev/null +++ b/src/Lucene.Net/Support/Text/EncodingExtensions.cs @@ -0,0 +1,51 @@ +using System.Text; + +namespace Lucene.Net.Support.Text +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// + /// Extension methods for . + /// + internal static class EncodingExtensions + { + /// + /// Returns a new instance with the set to throw + /// an exception when an invalid byte sequence is encountered. + /// + /// This is equivalent to Java's CodingErrorAction.REPORT for both onMalformedInput and + /// onUnmappableCharacter and will throw a when failing + /// to decode a string. This exception is equivalent to Java's CharacterCodingException, which is + /// a base exception type for both MalformedInputException and UnmappableCharacterException. + /// Thus, to translate Java code that catches any of those exceptions, you can catch + /// . + /// + /// The encoding to set the fallbacks on. + /// A new instance with the fallbacks set to throw an exception. + /// + /// Note that it is necessary to return a new, cloned instance because + /// the property is read-only without cloning. + /// + public static Encoding WithDecoderExceptionFallback(this Encoding encoding) + { + Encoding newEncoding = (Encoding)encoding.Clone(); + newEncoding.DecoderFallback = DecoderFallback.ExceptionFallback; + return newEncoding; + } + } +} diff --git a/src/Lucene.Net/Util/IOUtils.cs b/src/Lucene.Net/Util/IOUtils.cs index c3141b00da..624336ecf9 100644 --- a/src/Lucene.Net/Util/IOUtils.cs +++ b/src/Lucene.Net/Util/IOUtils.cs @@ -2,6 +2,7 @@ using Lucene.Net.Diagnostics; using Lucene.Net.Support; using Lucene.Net.Support.IO; +using Lucene.Net.Support.Text; using System; using System.Collections.Generic; using System.Diagnostics; @@ -378,7 +379,8 @@ private static void AddSuppressed(Exception exception, Exception suppressed) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static TextReader GetDecodingReader(Stream stream, Encoding charSet) { - return new StreamReader(stream, charSet); + var charSetDecoder = charSet.WithDecoderExceptionFallback(); + return new StreamReader(stream, charSetDecoder); } /// From ba468c13673e3a4377d5d60f1138c8e008c06005 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Mon, 6 Jan 2025 12:31:15 -0700 Subject: [PATCH 02/13] Add unit test for WithDecoderExceptionFallback --- .../Support/Text/TestEncodingExtensions.cs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs diff --git a/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs new file mode 100644 index 0000000000..6f6a266bfe --- /dev/null +++ b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs @@ -0,0 +1,43 @@ +using Lucene.Net.Attributes; +using Lucene.Net.Support.Text; +using Lucene.Net.Util; +using NUnit.Framework; +using System.Text; + +namespace Lucene.Net.Text +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + [TestFixture] + public class TestEncodingExtensions : LuceneTestCase + { + [Test, LuceneNetSpecific] + public void TestWithDecoderExceptionFallback() + { + Encoding encoding = Encoding.UTF8; + Encoding newEncoding = encoding.WithDecoderExceptionFallback(); + Assert.AreNotSame(encoding, newEncoding); + Assert.AreEqual(DecoderFallback.ExceptionFallback, newEncoding.DecoderFallback); + + Assert.Throws(() => + { + _ = newEncoding.GetString(new byte[] { 0xF0 }); + }); + } + } +} From fa7367667a2daba6cba00d3df535de8d8a358bda Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Mon, 6 Jan 2025 16:50:22 -0700 Subject: [PATCH 03/13] Fix unit test namespace and doc comment --- src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs | 3 +-- src/Lucene.Net/Support/Text/EncodingExtensions.cs | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs index 6f6a266bfe..55123917e2 100644 --- a/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs +++ b/src/Lucene.Net.Tests/Support/Text/TestEncodingExtensions.cs @@ -1,10 +1,9 @@ using Lucene.Net.Attributes; -using Lucene.Net.Support.Text; using Lucene.Net.Util; using NUnit.Framework; using System.Text; -namespace Lucene.Net.Text +namespace Lucene.Net.Support.Text { /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/src/Lucene.Net/Support/Text/EncodingExtensions.cs b/src/Lucene.Net/Support/Text/EncodingExtensions.cs index 4e3cd5af4b..320da35e67 100644 --- a/src/Lucene.Net/Support/Text/EncodingExtensions.cs +++ b/src/Lucene.Net/Support/Text/EncodingExtensions.cs @@ -35,8 +35,8 @@ internal static class EncodingExtensions /// Thus, to translate Java code that catches any of those exceptions, you can catch /// . /// - /// The encoding to set the fallbacks on. - /// A new instance with the fallbacks set to throw an exception. + /// The encoding to clone and set the fallback on. + /// A new instance with the fallback set to throw an exception. /// /// Note that it is necessary to return a new, cloned instance because /// the property is read-only without cloning. From f7a9186a210b1083761db6b2fc992794a956bed8 Mon Sep 17 00:00:00 2001 From: Shad Storhaug Date: Thu, 9 Jan 2025 01:46:42 +0700 Subject: [PATCH 04/13] Lucene.Net.Support.Buffers: Added ArrayPoolExtensions class to simplify returning arrays that might be null --- .../Support/Buffers/ArrayPoolExtensions.cs | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs diff --git a/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs new file mode 100644 index 0000000000..baad585409 --- /dev/null +++ b/src/Lucene.Net/Support/Buffers/ArrayPoolExtensions.cs @@ -0,0 +1,43 @@ +using System.Buffers; +using System.Runtime.CompilerServices; +#nullable enable + +namespace Lucene.Net.Support.Buffers +{ + /// + /// Extensions to + /// + internal static class ArrayPoolExtensions + { + /// + /// Returns to the pool an array that was previously obtained via on the same + /// instance. This method is a no-op if is null. + /// + /// This . + /// + /// The buffer previously obtained from to return to the pool. If null, + /// no operation will take place. + /// + /// + /// If true and if the pool will store the buffer to enable subsequent reuse, + /// will clear of its contents so that a subsequent consumer via + /// will not see the previous consumer's content. If false or if the pool will release the buffer, + /// the array's contents are left unchanged. + /// + /// + /// Once a buffer has been returned to the pool, the caller gives up all ownership of the buffer + /// and must not use it. The reference returned from a given call to must only be + /// returned via once. The default + /// may hold onto the returned buffer in order to rent it again, or it may release the returned buffer + /// if it's determined that the pool already has enough buffers stored. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ReturnIfNotNull(this ArrayPool pool, T[]? array, bool clearArray = false) + { + if (array != null) + { + pool.Return(array, clearArray); + } + } + } +} From fcc412e370f3975f386b9d74710348bd7b736103 Mon Sep 17 00:00:00 2001 From: Shad Storhaug Date: Thu, 9 Jan 2025 02:09:39 +0700 Subject: [PATCH 05/13] Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on target frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working. --- Directory.Build.targets | 7 +++ src/Lucene.Net.Tests/Index/TestTerm.cs | 60 ++++++++++++++++++- .../Support/TestApiConsistency.cs | 2 +- src/Lucene.Net/Index/Term.cs | 46 ++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) diff --git a/Directory.Build.targets b/Directory.Build.targets index fd71ab0554..1dc7daa031 100644 --- a/Directory.Build.targets +++ b/Directory.Build.targets @@ -37,6 +37,13 @@ + + + + $(DefineConstants);FEATURE_UTF8_TOUTF16 + + + diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Index/TestTerm.cs index 425670dcd6..577a3781e7 100644 --- a/src/Lucene.Net.Tests/Index/TestTerm.cs +++ b/src/Lucene.Net.Tests/Index/TestTerm.cs @@ -1,3 +1,5 @@ +using Lucene.Net.Attributes; +using Lucene.Net.Util; using NUnit.Framework; using Assert = Lucene.Net.TestFramework.Assert; @@ -39,5 +41,61 @@ public virtual void TestEquals() Assert.IsFalse(@base.Equals(differentText)); Assert.IsFalse(@base.Equals(differentType)); } + + [Test, LuceneNetSpecific] + public void TestToString_ValidUtf8Data() + { + // Arrange + var validUtf8 = new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F }; // "Hello" + var bytesRef = new BytesRef(validUtf8, 0, validUtf8.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("Hello", result); + } + + [Test, LuceneNetSpecific] + public void TestToString_InvalidUtf8Data() + { + // Arrange + var invalidUtf8 = new byte[] { 0xC3, 0x28 }; // Invalid UTF-8 sequence + var bytesRef = new BytesRef(invalidUtf8, 0, invalidUtf8.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("[c3 28]", result); // Should match BytesRef.ToString() + } + + [Test, LuceneNetSpecific] + public void TestToString_Utf8WithBom() + { + // Arrange + var utf8WithBom = new byte[] { 0xEF, 0xBB, 0xBF, 0x48, 0x69 }; // BOM + "Hi" + var bytesRef = new BytesRef(utf8WithBom, 0, utf8WithBom.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("\uFEFFHi", result); // BOM is preserved in the string + } + + [Test, LuceneNetSpecific] + public void TestToString_Utf8WithoutBom() + { + // Arrange + var utf8WithoutBom = new byte[] { 0x48, 0x69 }; // "Hi" + var bytesRef = new BytesRef(utf8WithoutBom, 0, utf8WithoutBom.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("Hi", result); + } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs index f565676ac7..04b96b91bc 100644 --- a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs +++ b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs @@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly) [TestCase(typeof(Lucene.Net.Analysis.Analyzer))] public override void TestPrivateFieldNames(Type typeFromTargetAssembly) { - base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)"); + base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$"); } [Test, LuceneNetSpecific] diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs index 2de523a20a..6930fa5430 100644 --- a/src/Lucene.Net/Index/Term.cs +++ b/src/Lucene.Net/Index/Term.cs @@ -1,7 +1,9 @@ using J2N.Text; using Lucene.Net.Support; +using Lucene.Net.Support.Buffers; using Lucene.Net.Support.Text; using System; +using System.Buffers; using System.Text; namespace Lucene.Net.Index @@ -35,6 +37,8 @@ namespace Lucene.Net.Index /// public sealed class Term : IComparable, IEquatable // LUCENENET specific - class implements IEquatable { + private const int CharStackBufferSize = 64; + /// /// Constructs a with the given field and bytes. /// Note that a null field or null bytes value results in undefined @@ -85,12 +89,52 @@ public Term(string fld) /// public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value. +#nullable enable /// /// Returns human-readable form of the term text. If the term is not unicode, /// the raw bytes will be printed instead. /// public static string ToString(BytesRef termText) { + if (termText is null) + throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause +#if FEATURE_UTF8_TOUTF16 + // View the relevant portion of the byte array + ReadOnlySpan utf8Span = new ReadOnlySpan(termText.Bytes, termText.Offset, termText.Length); + + // Allocate a buffer for the maximum possible UTF-16 output + int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII) + char[]? arrayToReturnToPool = null; + + Span charBuffer = maxChars > CharStackBufferSize + ? (arrayToReturnToPool = ArrayPool.Shared.Rent(maxChars)) + : stackalloc char[CharStackBufferSize]; + try + { + // Decode the UTF-8 bytes to UTF-16 chars + OperationStatus status = System.Text.Unicode.Utf8.ToUtf16( + utf8Span, + charBuffer, + out int bytesConsumed, + out int charsWritten, + replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace + + // NOTE: We handle OperationStatus.InvalidData below in the fallback path. + if (status == OperationStatus.Done) + { + // Successfully decoded the UTF-8 input + return charBuffer.Slice(0, charsWritten).ToString(); + } + } + finally + { + // Return the buffer to the pool + ArrayPool.Shared.ReturnIfNotNull(arrayToReturnToPool); + } + + // Fallback to the default string representation if decoding fails + return termText.ToString(); +#else // the term might not be text, but usually is. so we make a best effort Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback(); try @@ -101,7 +145,9 @@ public static string ToString(BytesRef termText) { return termText.ToString(); } +#endif } +#nullable restore /// /// Returns the bytes of this term. From 955872b72981ef89e0d9a16a5029bb6176613218 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:16:19 -0700 Subject: [PATCH 06/13] Cache decoder fallback encoding lookup, #1076 --- .../Support/Text/EncodingExtensions.cs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/Lucene.Net/Support/Text/EncodingExtensions.cs b/src/Lucene.Net/Support/Text/EncodingExtensions.cs index 320da35e67..5e1c3574cd 100644 --- a/src/Lucene.Net/Support/Text/EncodingExtensions.cs +++ b/src/Lucene.Net/Support/Text/EncodingExtensions.cs @@ -1,4 +1,6 @@ +using System.Collections.Concurrent; using System.Text; +#nullable enable namespace Lucene.Net.Support.Text { @@ -24,8 +26,10 @@ namespace Lucene.Net.Support.Text /// internal static class EncodingExtensions { + private static readonly ConcurrentDictionary decoderExceptionFallbackCache = new(); + /// - /// Returns a new instance with the set to throw + /// Returns an instance with the set to throw /// an exception when an invalid byte sequence is encountered. /// /// This is equivalent to Java's CodingErrorAction.REPORT for both onMalformedInput and @@ -36,16 +40,19 @@ internal static class EncodingExtensions /// . /// /// The encoding to clone and set the fallback on. - /// A new instance with the fallback set to throw an exception. + /// An instance with the fallback set to throw an exception. /// - /// Note that it is necessary to return a new, cloned instance because + /// Note that it is necessary to clone the instance because /// the property is read-only without cloning. /// public static Encoding WithDecoderExceptionFallback(this Encoding encoding) { - Encoding newEncoding = (Encoding)encoding.Clone(); - newEncoding.DecoderFallback = DecoderFallback.ExceptionFallback; - return newEncoding; + return decoderExceptionFallbackCache.GetOrAdd(encoding, static e => + { + Encoding newEncoding = (Encoding)e.Clone(); + newEncoding.DecoderFallback = DecoderFallback.ExceptionFallback; + return newEncoding; + }); } } } From b61a322d7512a44529581098a0938e9e09ba2380 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:23:28 -0700 Subject: [PATCH 07/13] Treat Encoder/DecoderFallbackExceptions as IOExceptions to match Java, #1076 --- .../ExceptionScanningTestCase.cs | 7 +++---- .../ExceptionHandling/ExceptionExtensions.cs | 16 +++++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs index 8fa941f429..66de7a41e5 100644 --- a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs +++ b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs @@ -13,6 +13,7 @@ using System.Reflection; using System.Resources; using System.Security; +using System.Text; using Assert = Lucene.Net.TestFramework.Assert; namespace Lucene.Net.Support.ExceptionHandling @@ -184,6 +185,8 @@ private static IEnumerable LoadKnownErrorExceptionTypes() typeof(UnauthorizedAccessException), typeof(ObjectDisposedException), typeof(Lucene.AlreadyClosedException), + typeof(EncoderFallbackException), // In Java, CharacterCodingException subclasses IOException + typeof(DecoderFallbackException), }.Union(AllIOExceptionTypes) // .NET Framework only - Subclasses UnauthorizedAccessException .Union(new[] { PrivilegeNotHeldExceptionType }); @@ -221,8 +224,6 @@ private static IEnumerable LoadKnownErrorExceptionTypes() // Subclasses typeof(System.DuplicateWaitObjectException), typeof(System.Globalization.CultureNotFoundException), - typeof(System.Text.DecoderFallbackException), - typeof(System.Text.EncoderFallbackException), }; public static readonly IEnumerable KnownIllegalArgumentExceptionTypes_TestEnvironment = new Type[] { @@ -234,8 +235,6 @@ private static IEnumerable LoadKnownErrorExceptionTypes() // Subclasses typeof(System.DuplicateWaitObjectException), typeof(System.Globalization.CultureNotFoundException), - typeof(System.Text.DecoderFallbackException), - typeof(System.Text.EncoderFallbackException), }; public static readonly IEnumerable KnownRuntimeExceptionTypes = LoadKnownRuntimeExceptionTypes(); diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs index f7b9c1e512..09ba5bdf78 100644 --- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs +++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs @@ -6,6 +6,7 @@ using System.Resources; using System.Runtime.CompilerServices; using System.Security; +using System.Text; using System.Threading; namespace Lucene @@ -213,8 +214,11 @@ public static bool IsIOException(this Exception e) if (e is null || e.IsAlwaysIgnored()) return false; return e is IOException || - e.IsAlreadyClosedException() || // In Lucene, AlreadyClosedException subclass IOException instead of InvalidOperationException, so we need a special case here - e is UnauthorizedAccessException; // In Java, java.nio.file.AccessDeniedException subclasses IOException + e.IsAlreadyClosedException() || // In Lucene, AlreadyClosedException subclass IOException instead of InvalidOperationException, so we need a special case here + e is + UnauthorizedAccessException // In Java, java.nio.file.AccessDeniedException subclasses IOException + or DecoderFallbackException // In Java, CharacterCodingException subclasses IOException + or EncoderFallbackException; } /// @@ -368,9 +372,11 @@ public static bool IsIllegalArgumentException(this Exception e) // LUCENENET: In production, there is a chance that we will upgrade to ArgumentNullExcpetion or ArgumentOutOfRangeException // and it is still important that those are caught. However, we have a copy of this method in the test environment // where this is done more strictly to catch ArgumentException without its known subclasses so we can be more explicit in tests. - return e is ArgumentException; - //!(e is ArgumentNullException) && // Corresponds to NullPointerException, so we don't catch it here. - //!(e is ArgumentOutOfRangeException); // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. + return e is ArgumentException + and not DecoderFallbackException // In Java, CharacterCodingException subclasses IOException, not ArgumentException + and not EncoderFallbackException; + //!(e is ArgumentNullException) && // Corresponds to NullPointerException, so we don't catch it here. + //!(e is ArgumentOutOfRangeException); // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. } /// From d2c4b52a5d056ea651f06bdd87d1b159fd8305ab Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:33:54 -0700 Subject: [PATCH 08/13] Fix translation of replacement fallback test code, IOException/RuntimeException tests --- .../ExceptionScanningTestCase.cs | 2 -- .../TestExceptionExtensions.cs | 27 +++++++++---------- .../WriterCache/TestCharBlockArray.cs | 22 +++------------ .../WriterCache/TestCompactLabelToOrdinal.cs | 18 +++---------- 4 files changed, 21 insertions(+), 48 deletions(-) diff --git a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs index 66de7a41e5..8cd3942d66 100644 --- a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs +++ b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/ExceptionScanningTestCase.cs @@ -366,8 +366,6 @@ private static IEnumerable LoadKnownRuntimeExceptionTypes() typeof(System.Runtime.Serialization.SerializationException), typeof(System.Security.Cryptography.CryptographicException), typeof(System.Security.VerificationException), - typeof(System.Text.DecoderFallbackException), // LUCENENET TODO: Need to be sure about this one - typeof(System.Text.EncoderFallbackException), // LUCENENET TODO: Need to be sure about this one typeof(System.Threading.AbandonedMutexException), typeof(System.Threading.SemaphoreFullException), typeof(System.Threading.SynchronizationLockException), diff --git a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs index 1960c482b3..650871b236 100644 --- a/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs +++ b/src/Lucene.Net.Tests.AllProjects/Support/ExceptionHandling/TestExceptionExtensions.cs @@ -1,5 +1,4 @@ -using J2N.Text; -using Lucene.Net.Attributes; +using Lucene.Net.Attributes; using NUnit.Framework; using System; using System.Collections.Generic; @@ -195,7 +194,7 @@ private static void ThrowException(Type exceptionType) } [Test] - [TestCaseSource("ThrowableTypeExpressions")] + [TestCaseSource(nameof(ThrowableTypeExpressions))] public void TestIsThrowable(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsThrowable(); @@ -211,7 +210,7 @@ public void TestIsThrowable(Type exceptionType, bool expectedToThrow, Action exp } [Test] - [TestCaseSource("ErrorTypeExpressions")] + [TestCaseSource(nameof(ErrorTypeExpressions))] public void TestIsError(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsError(); @@ -229,7 +228,7 @@ public void TestIsError(Type exceptionType, bool expectedToThrow, Action express // This test ensures that all known Error types from Java are not caught by // our IsException() handler. [Test] - [TestCaseSource("ExceptionTypeExpressions")] + [TestCaseSource(nameof(ExceptionTypeExpressions))] public void TestIsException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsException(); @@ -247,7 +246,7 @@ public void TestIsException(Type exceptionType, bool expectedToThrow, Action exp // This test ensures that all known Error types from Java are not caught by // our IsRuntimeException() handler. [Test] - [TestCaseSource("RuntimeExceptionTypeExpressions")] + [TestCaseSource(nameof(RuntimeExceptionTypeExpressions))] public void TestIsRuntimeException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsRuntimeException(); @@ -263,7 +262,7 @@ public void TestIsRuntimeException(Type exceptionType, bool expectedToThrow, Act } [Test] - [TestCaseSource("IOExceptionTypeExpressions")] + [TestCaseSource(nameof(IOExceptionTypeExpressions))] public void TestIsIOException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsIOException(); @@ -282,7 +281,7 @@ public void TestIsIOException(Type exceptionType, bool expectedToThrow, Action e // NUnit's AssertionException and MultipleAssertException types are all treated as if they were AssertionError // in Java. [Test] - [TestCaseSource("AssertionErrorTypeExpressions")] + [TestCaseSource(nameof(AssertionErrorTypeExpressions))] public void TestIsAssertionError(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsAssertionError(); @@ -302,7 +301,7 @@ public void TestIsAssertionError(Type exceptionType, bool expectedToThrow, Actio // Java has 2 other types ArrayIndexOutOfBoundsException and StringIndexOutOfBoundsException, whose alias // exception types are also part of the test. [Test] - [TestCaseSource("IndexOutOfBoundsExceptionTypeExpressions")] + [TestCaseSource(nameof(IndexOutOfBoundsExceptionTypeExpressions))] public void TestIsIndexOutOfBoundsException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsIndexOutOfBoundsException(); @@ -320,7 +319,7 @@ public void TestIsIndexOutOfBoundsException(Type exceptionType, bool expectedToT // This test ensures that ArgumentNullException and NullReferenceException are both caught by our // NullPointerException handler, because they both correspond to NullPointerException in Java [Test] - [TestCaseSource("NullPointerExceptionTypeExpressions")] + [TestCaseSource(nameof(NullPointerExceptionTypeExpressions))] public void TestIsNullPointerException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { static bool extensionMethod(Exception e) => e.IsNullPointerException(); @@ -339,7 +338,7 @@ public void TestIsNullPointerException(Type exceptionType, bool expectedToThrow, // We do it this way in production to ensure that if we "upgrade" to a .NET // ArgumentNullException or ArgumentOutOfRangeException it won't break the code. [Test] - [TestCaseSource("IllegalArgumentExceptionTypeExpressions")] + [TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions))] public void TestIsIllegalArgumentException(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { // Make sure we are testing the production code @@ -360,7 +359,7 @@ public void TestIsIllegalArgumentException(Type exceptionType, bool expectedToTh // in the test environment to ensure that if a test is specified wrong it will fail and should be updated // and commented to indicate we diverged from Lucene. [Test] - [TestCaseSource("IllegalArgumentExceptionTypeExpressions_TestEnvironment")] + [TestCaseSource(nameof(IllegalArgumentExceptionTypeExpressions_TestEnvironment))] public void TestIsIllegalArgumentException_TestEnvironment(Type exceptionType, bool expectedToThrow, Action expression) // LUCENENET NOTE: exceptionType is only here to make NUnit display them all { // Make sure we are testing the test environment code @@ -376,7 +375,7 @@ public void TestIsIllegalArgumentException_TestEnvironment(Type exceptionType, b } } - private void AssertCatches(Action action, Func extensionMethodExpression) + private static void AssertCatches(Action action, Func extensionMethodExpression) { try { @@ -397,7 +396,7 @@ private void AssertCatches(Action action, Func extensionMethodE } } - private void AssertDoesNotCatch(Action action, Func extensionMethodExpression) + private static void AssertDoesNotCatch(Action action, Func extensionMethodExpression) { try { diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs index ecf2d317bb..b658a3cb56 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs @@ -2,6 +2,7 @@ using J2N.IO; using J2N.Text; using Lucene.Net.Attributes; +using Lucene.Net.Support; using NUnit.Framework; using System; using System.IO; @@ -40,24 +41,13 @@ public virtual void TestArray() byte[] buffer = new byte[50]; - // This is essentially the equivalent of - // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() - // .onUnmappableCharacter(CodingErrorAction.REPLACE) - // .onMalformedInput(CodingErrorAction.REPLACE); - // - // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - // new EncoderReplacementFallback("?"), - // new DecoderReplacementFallback("?")); - for (int i = 0; i < n; i++) { Random.NextBytes(buffer); int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - new EncoderReplacementFallback("?"), - new DecoderReplacementFallback("?")); + Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); @@ -69,9 +59,7 @@ public virtual void TestArray() int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - new EncoderReplacementFallback("?"), - new DecoderReplacementFallback("?")); + Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); @@ -83,9 +71,7 @@ public virtual void TestArray() int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - new EncoderReplacementFallback("?"), - new DecoderReplacementFallback("?")); + Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default string s = decoder.GetString(buffer, 0, size); for (int j = 0; j < s.Length; j++) { diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs index b8d013a6d8..a7c7861d12 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs @@ -1,5 +1,6 @@ // Lucene version compatibility level 4.8.1 using Lucene.Net.Attributes; +using Lucene.Net.Support; using NUnit.Framework; using System; using System.Collections.Generic; @@ -43,15 +44,6 @@ public virtual void TestL2O() string[] uniqueValues = new string[numUniqueValues]; byte[] buffer = new byte[50]; - // This is essentially the equivalent of - // CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() - // .onUnmappableCharacter(CodingErrorAction.REPLACE) - // .onMalformedInput(CodingErrorAction.REPLACE); - // - // Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - // new EncoderReplacementFallback("?"), - // new DecoderReplacementFallback("?")); - Random random = Random; for (int i = 0; i < numUniqueValues;) { @@ -60,9 +52,7 @@ public virtual void TestL2O() // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = Encoding.GetEncoding(Encoding.UTF8.CodePage, - new EncoderReplacementFallback("?"), - new DecoderReplacementFallback("?")); + Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default uniqueValues[i] = decoder.GetString(buffer, 0, size); // we cannot have empty path components, so eliminate all prefix as well // as middle consecutive delimiter chars. @@ -292,6 +282,6 @@ public override int GetOrdinal(FacetLabel label) } return LabelToOrdinal.INVALID_ORDINAL; } - } + } } -} \ No newline at end of file +} From 66f3e3c91fc966db11f692da9f17f4460d64d3c2 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:40:42 -0700 Subject: [PATCH 09/13] Use Encoding.Default instead of GetEncoding(0), #1076 --- .../ByTask/Feeds/ContentItemsSource.cs | 6 +++--- src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs | 2 +- src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs | 9 ++++++--- .../ByTask/Tasks/CreateIndexTaskTest.cs | 4 ++-- src/Lucene.Net.Tests.Demo/TestDemo.cs | 6 ++---- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs index 7932f749ec..4b50076cdf 100644 --- a/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs +++ b/src/Lucene.Net.Benchmark/ByTask/Feeds/ContentItemsSource.cs @@ -32,7 +32,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds /// Base class for source of data for benchmarking. /// /// - /// Keeps track of various statistics, such as how many data items were generated, + /// Keeps track of various statistics, such as how many data items were generated, /// size in bytes etc. /// /// Supports the following configuration parameters: @@ -40,7 +40,7 @@ namespace Lucene.Net.Benchmarks.ByTask.Feeds /// content.source.foreverspecifies whether to generate items forever (default=true). /// content.source.verbosespecifies whether messages should be output by the content source (default=false). /// content.source.encoding - /// specifies which encoding to use when + /// specifies which encoding to use when /// reading the files of that content source. Certain implementations may define /// a default value if this parameter is not specified. (default=null). /// @@ -199,7 +199,7 @@ public virtual void SetConfig(Config config) } else { - m_encoding = Encoding.GetEncoding(0); // Default system encoding + m_encoding = Encoding.Default; // Default system encoding } } diff --git a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs index 6fd4cba208..1e8ed83362 100644 --- a/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs +++ b/src/Lucene.Net.Benchmark/ByTask/Tasks/CreateIndexTask.cs @@ -215,7 +215,7 @@ public static IndexWriter ConfigureWriter(Config config, PerfRunData runData, Op else { FileInfo f = new FileInfo(infoStreamVal); - iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.GetEncoding(0))); + iwc.SetInfoStream(new StreamWriter(new FileStream(f.FullName, FileMode.Create, FileAccess.Write), Encoding.Default)); } } IndexWriter writer = new IndexWriter(runData.Directory, iwc); diff --git a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs index d08dab75a5..a223653dc9 100644 --- a/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs +++ b/src/Lucene.Net.Benchmark/Quality/Trec/QueryDriver.cs @@ -77,10 +77,13 @@ public static void Main(string[] args) string fieldSpec = args.Length == 5 ? args[4] : "T"; // default to Title-only if not specified. IndexSearcher searcher = new IndexSearcher(reader); - int maxResults = 1000; - string docNameField = "docname"; + const int maxResults = 1000; + const string docNameField = "docname"; - TextWriter logger = Console.Out; //new StreamWriter(Console, Encoding.GetEncoding(0)); + using TextWriter logger = new StreamWriter(System.Console.OpenStandardOutput(), Encoding.Default) + { + AutoFlush = true, + }; // use trec utilities to read trec topics into quality queries TrecTopicsReader qReader = new TrecTopicsReader(); diff --git a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs index 00f85106e7..169b781504 100644 --- a/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs +++ b/src/Lucene.Net.Tests.Benchmark/ByTask/Tasks/CreateIndexTaskTest.cs @@ -56,7 +56,7 @@ public void TestInfoStream_SystemOutErr() TextWriter curOut = Console.Out; ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Console.Out = new StreamWriter(baos, Encoding.GetEncoding(0)); + Console.Out = new StreamWriter(baos, Encoding.Default); try { PerfRunData runData = createPerfRunData("SystemOut"); @@ -72,7 +72,7 @@ public void TestInfoStream_SystemOutErr() TextWriter curErr = Console.Error; baos = new ByteArrayOutputStream(); - Console.Error = new StreamWriter(baos, Encoding.GetEncoding(0)); + Console.Error = new StreamWriter(baos, Encoding.Default); try { PerfRunData runData = createPerfRunData("SystemErr"); diff --git a/src/Lucene.Net.Tests.Demo/TestDemo.cs b/src/Lucene.Net.Tests.Demo/TestDemo.cs index 16b2379a53..9957f7b715 100644 --- a/src/Lucene.Net.Tests.Demo/TestDemo.cs +++ b/src/Lucene.Net.Tests.Demo/TestDemo.cs @@ -33,8 +33,7 @@ private void TestOneSearch(DirectoryInfo indexPath, string query, int expectedHi try { MemoryStream bytes = new MemoryStream(); - // .NET NOTE: GetEncoding(0) returns the current system's default encoding - var fakeSystemOut = new StreamWriter(bytes, Encoding.GetEncoding(0)); + var fakeSystemOut = new StreamWriter(bytes, Encoding.Default); Console.SetOut(fakeSystemOut); // LUCENENET specific: changed the arguments to act more like the dotnet.exe commands. // * only optional arguments start with - @@ -44,8 +43,7 @@ private void TestOneSearch(DirectoryInfo indexPath, string query, int expectedHi // it consistent with the lucene-cli utility. SearchFiles.Main(new string[] { indexPath.FullName, "--query", query }); fakeSystemOut.Flush(); - // .NET NOTE: GetEncoding(0) returns the current system's default encoding - string output = Encoding.GetEncoding(0).GetString(bytes.ToArray()); // intentionally use default encoding + string output = Encoding.Default.GetString(bytes.ToArray()); // intentionally use default encoding assertTrue("output=" + output, output.Contains(expectedHitCount + " total matching documents")); } finally From c07cc8738b2100b5839fe03bf93c34920670ea7a Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Fri, 10 Jan 2025 20:50:10 -0700 Subject: [PATCH 10/13] Cache GB2312 encoding lookup, #1076 --- .../Analysis/Hunspell/Dictionary.cs | 2 +- .../Analysis/Hunspell/ISO8859_14Decoder.cs | 7 ++++++- .../Hhmm/AbstractDictionary.cs | 11 +++++++---- .../Hhmm/BigramDictionary.cs | 2 +- .../Hhmm/WordDictionary.cs | 2 +- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs index ceeb7eb532..b790df5e67 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs @@ -746,7 +746,7 @@ private static Encoding GetSystemEncoding(string encoding) // LUCENENET: CA1822: } if ("ISO8859-14".Equals(encoding, StringComparison.OrdinalIgnoreCase)) { - return new ISO8859_14Encoding(); + return ISO8859_14Encoding.Default; } // .NET doesn't recognize the encoding without a dash between ISO and the number // https://msdn.microsoft.com/en-us/library/system.text.encodinginfo.getencoding(v=vs.110).aspx diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs index 6078954049..7b7eb59c1f 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs @@ -28,6 +28,11 @@ namespace Lucene.Net.Analysis.Hunspell [ExceptionToClassNameConvention] internal sealed class ISO8859_14Encoding : Encoding { + /// + /// The default singleton instance of the class. + /// + public static new ISO8859_14Encoding Default { get; } = new ISO8859_14Encoding(); + private static readonly Decoder decoder = new ISO8859_14Decoder(); public override Decoder GetDecoder() { @@ -119,4 +124,4 @@ public override int GetChars(byte[] bytesIn, int byteIndex, int byteCount, char[ return writeCount; } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs index 5e2139018e..1d5da6d3af 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs @@ -32,6 +32,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm /// internal abstract class AbstractDictionary { + // LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312") + protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312"); + /// /// First Chinese Character in GB2312 (15 * 94) /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. @@ -39,7 +42,7 @@ internal abstract class AbstractDictionary public const int GB2312_FIRST_CHAR = 1410; /// - /// Last Chinese Character in GB2312 (87 * 94). + /// Last Chinese Character in GB2312 (87 * 94). /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. /// public const int GB2312_CHAR_NUM = 87 * 94; @@ -98,7 +101,7 @@ public virtual string GetCCByGB2312Id(int ccid) try { //String cchar = new String(buffer, "GB2312"); - string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); + string cchar = gb2312Encoding.GetString(buffer); // LUCENENET specific: use cached encoding instance return cchar; } catch (Exception e) when (e.IsUnsupportedEncodingException()) // Encoding is not supported by the platform @@ -117,7 +120,7 @@ public virtual short GetGB2312Id(char ch) try { //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); - byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); + byte[] buffer = gb2312Encoding.GetBytes(ch.ToString()); // LUCENENET specific: use cached encoding instance //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); if (buffer.Length != 2) { @@ -125,7 +128,7 @@ public virtual short GetGB2312Id(char ch) return -1; } int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 - int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. + int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. // Therefore, each code page only has 16*6-2=94 characters. return (short)(b0 * 94 + b1); } diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs index b9d16273ae..da712cb0de 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs @@ -302,7 +302,7 @@ public virtual void LoadFromFile(string dctFilePath) byte[] lchBuffer = new byte[length]; dctFile.Read(lchBuffer, 0, lchBuffer.Length); //tmpword = new String(lchBuffer, "GB2312"); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); if (i != 3755 + GB2312_FIRST_CHAR) { diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs index b8cd7cbbfa..b6e42be522 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs @@ -395,7 +395,7 @@ private int LoadMainDataFromFile(string dctFilePath) { byte[] lchBuffer = new byte[length]; dctFile.Read(lchBuffer, 0, lchBuffer.Length); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); } else From b850580f89b28e2fc303cfad4e83cd1f618e66ca Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Sat, 11 Jan 2025 11:39:20 -0700 Subject: [PATCH 11/13] Replace StandardCharsets.UTF_8 with Encoding.UTF8 in two tests, #1076 --- .../Taxonomy/WriterCache/TestCharBlockArray.cs | 6 +++--- .../Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs index b658a3cb56..0c9dab11d9 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCharBlockArray.cs @@ -47,7 +47,7 @@ public virtual void TestArray() int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default + Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); @@ -59,7 +59,7 @@ public virtual void TestArray() int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default + Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default string s = decoder.GetString(buffer, 0, size); array.Append(s); builder.Append(s); @@ -71,7 +71,7 @@ public virtual void TestArray() int size = 1 + Random.Next(50); // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default + Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default string s = decoder.GetString(buffer, 0, size); for (int j = 0; j < s.Length; j++) { diff --git a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs index a7c7861d12..d38b1fd459 100644 --- a/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs +++ b/src/Lucene.Net.Tests.Facet/Taxonomy/WriterCache/TestCompactLabelToOrdinal.cs @@ -52,7 +52,7 @@ public virtual void TestL2O() // This test is turning random bytes into a string, // this is asking for trouble. - Encoding decoder = StandardCharsets.UTF_8; // LUCENENET specific: no need to set decoder fallback, because it already replaces by default + Encoding decoder = Encoding.UTF8; // LUCENENET specific: no need to set decoder fallback, because Encoding.UTF8 already replaces by default uniqueValues[i] = decoder.GetString(buffer, 0, size); // we cannot have empty path components, so eliminate all prefix as well // as middle consecutive delimiter chars. From 4b574b9f8752fabb52cd480195bffb3b44cc2595 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Sat, 11 Jan 2025 12:00:35 -0700 Subject: [PATCH 12/13] Fix test extension method for detecting IllegalArgumentException, #1076 --- .../Support/ExceptionHandling/ExceptionExtensions.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs index f2d39de48e..354f596f3f 100644 --- a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs +++ b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs @@ -1,5 +1,6 @@ using System; using System.Runtime.CompilerServices; +using System.Text; namespace Lucene.Net { @@ -56,9 +57,11 @@ public static bool IsIllegalArgumentException(this Exception e) // If our exception implements IError and subclasses ArgumentException, we will ignore it. if (e is null || e.IsError() || e.IsAlwaysIgnored()) return false; - return e is ArgumentException && - e is not ArgumentNullException && // Corresponds to NullPointerException, so we don't catch it here. - e is not ArgumentOutOfRangeException; // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. + return e is ArgumentException + and not ArgumentNullException // Corresponds to NullPointerException, so we don't catch it here. + and not ArgumentOutOfRangeException // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. + and not DecoderFallbackException // CharacterCodingException is an IOException in Java, maps to DecoderFallbackException and EncoderFallbackException in .NET + and not EncoderFallbackException; } } } From 2d5ce91ceb01cf69fa3173e303be6579cd263274 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Sat, 11 Jan 2025 20:26:58 -0700 Subject: [PATCH 13/13] Cascade call from IsIllegalArgumentException --- .../Support/ExceptionHandling/ExceptionExtensions.cs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs index 354f596f3f..4248697183 100644 --- a/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs +++ b/src/Lucene.Net.TestFramework/Support/ExceptionHandling/ExceptionExtensions.cs @@ -54,14 +54,9 @@ internal static class ExceptionExtensions [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsIllegalArgumentException(this Exception e) { - // If our exception implements IError and subclasses ArgumentException, we will ignore it. - if (e is null || e.IsError() || e.IsAlwaysIgnored()) return false; - - return e is ArgumentException - and not ArgumentNullException // Corresponds to NullPointerException, so we don't catch it here. - and not ArgumentOutOfRangeException // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. - and not DecoderFallbackException // CharacterCodingException is an IOException in Java, maps to DecoderFallbackException and EncoderFallbackException in .NET - and not EncoderFallbackException; + return Lucene.ExceptionExtensions.IsIllegalArgumentException(e) + && e is not ArgumentNullException // Corresponds to NullPointerException, so we don't catch it here. + and not ArgumentOutOfRangeException; // Corresponds to IndexOutOfBoundsException (and subclasses), so we don't catch it here. } } }