Fix for issue 5850: Journal abbreviations in UTF-8 not recognized (#7639)

* fix issue #5850 for encoding problem * add a blank line for build.gradle * initial as main branch for build.gradle * initial as main branch for build.gradle * add the change of fix information of issue 5850 * Fix check style * Update CHANGELOG.md Co-authored-by: Christoph <siedlerkiller@gmail.com> * Add the utf8 check for biblatex and ascii check for bibtex * add the new localization string the l10 files * fix error * add the statement only in en.properties * revert changes * Update JabRef_da.properties * Update JabRef_ru.properties * Update build.gradle * Update JabRef_fa.properties * Update JabRef_no.properties * Update JabRef_pl.properties * Update JabRef_pt.properties * Update JabRef_vi.properties * Update JabRef_zh_TW.properties * reset the default charset * reset the default charset * add the javaDoc of UTF8Checker * add the javaDoc of UTF8CheckerTest and IntegrityCheckTest add 2 Junit Test for UTF8Checker.UTF8EncodingChecker in UTF8CheckerTest add 2 Junit Test for IntegrityCheck in IntegrityCheckTest * Remove the unwieldy Junit tests Co-authored-by: Christoph <siedlerkiller@gmail.com>
JabRef · Apr 23, 2021 · 434250d · 434250d
1 parent 2af578f
commit 434250d
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -77,6 +77,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - We fixed an issue where opening BibTex file (doubleclick) from Folder with spaces not working. [#6487](https://github.com/JabRef/jabref/issues/6487)
 - We fixed an issue with saving large `.bib` files [#7265](https://github.com/JabRef/jabref/issues/7265)
 - We fixed an issue with very large page numbers [#7590](https://github.com/JabRef/jabref/issues/7590)
+- We fixed an issue where journal abbreviations in UTF-8 were not recognized [#5850](https://github.com/JabRef/jabref/issues/5850)
 - We fixed an issue where the article title with curly brackets fails to download the arXiv link (pdf file). [#7633](https://github.com/JabRef/jabref/issues/7633)
 
 ### Removed

diff --git a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
@@ -38,9 +38,11 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
                 new CitationKeyDeviationChecker(bibDatabaseContext, citationKeyPatternPreferences),
                 new CitationKeyDuplicationChecker(bibDatabaseContext.getDatabase())
         ));
-
         if (bibDatabaseContext.isBiblatexMode()) {
-            entryCheckers.add(new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository));
+            entryCheckers.addAll(List.of(
+                    new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository),
+                    new UTF8Checker())
+            );
         } else {
             entryCheckers.addAll(List.of(
                     new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository),
@@ -59,7 +61,6 @@ List<IntegrityMessage> check() {
         for (BibEntry entry : database.getEntries()) {
             result.addAll(checkEntry(entry));
         }
-
         result.addAll(checkDatabase(database));
 
         return result;

diff --git a/src/main/java/org/jabref/logic/integrity/UTF8Checker.java b/src/main/java/org/jabref/logic/integrity/UTF8Checker.java
@@ -0,0 +1,53 @@
+package org.jabref.logic.integrity;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.jabref.logic.l10n.Localization;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.Field;
+
+public class UTF8Checker implements EntryChecker {
+
+    /**
+     * Reports every field whose value, once encoded with the charset named by the
+     * "file.encoding" system property, does not form a valid UTF-8 byte sequence.
+     * @param entry the entry whose fields are inspected
+     * @return one integrity message per offending field; an empty list if every field passes
+     */
+    @Override
+    public List<IntegrityMessage> check(BibEntry entry) {
+        Charset systemCharset = Charset.forName(System.getProperty("file.encoding"));
+        List<IntegrityMessage> messages = new ArrayList<>();
+        for (Map.Entry<Field, String> fieldEntry : entry.getFieldMap().entrySet()) {
+            byte[] encodedValue = fieldEntry.getValue().getBytes(systemCharset);
+            if (UTF8EncodingChecker(encodedValue)) {
+                continue;
+            }
+            messages.add(new IntegrityMessage(Localization.lang("Non-UTF-8 encoded field found"), entry,
+                    fieldEntry.getKey()));
+        }
+        return messages;
+    }
+
+    /**
+     * Tells whether the given bytes can be decoded as UTF-8 without error.
+     * @param data the raw bytes to validate
+     * @return true if the UTF-8 decoder accepts the bytes, false if it throws a CharacterCodingException
+     */
+    public static boolean UTF8EncodingChecker(byte[] data) {
+        CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder();
+        try {
+            utf8Decoder.decode(ByteBuffer.wrap(data));
+            return true;
+        } catch (CharacterCodingException malformedInput) {
+            return false;
+        }
+    }
+}
diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties
@@ -1629,6 +1629,7 @@ Style\ file=Style file
 Open\ OpenOffice/LibreOffice\ connection=Open OpenOffice/LibreOffice connection
 You\ must\ enter\ at\ least\ one\ field\ name=You must enter at least one field name
 Non-ASCII\ encoded\ character\ found=Non-ASCII encoded character found
+Non-UTF-8\ encoded\ field\ found=Non-UTF-8 encoded field found
 Toggle\ web\ search\ interface=Toggle web search interface
 
 Migration\ help\ information=Migration help information

diff --git a/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java b/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java
@@ -87,9 +87,9 @@ private static Stream<String> provideCorrectFormat() {
 
     private static Stream<String> provideIncorrectFormat() {
         return Stream.of("   Knuth, Donald E. ",
-                         "Knuth, Donald E. and Kurt Cobain and A. Einstein",
-                         ", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
-                         "and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
+                "Knuth, Donald E. and Kurt Cobain and A. Einstein",
+                ", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
+                "and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
     }
 
     @Test
@@ -190,10 +190,10 @@ private void assertCorrect(BibDatabaseContext context) {
 
     private void assertCorrect(BibDatabaseContext context, boolean allowIntegerEdition) {
         List<IntegrityMessage> messages = new IntegrityCheck(context,
-                                                             mock(FilePreferences.class),
-                                                             createCitationKeyPatternPreferences(),
-                                                             JournalAbbreviationLoader.loadBuiltInRepository(),
-                                                             allowIntegerEdition).check();
+                mock(FilePreferences.class),
+                createCitationKeyPatternPreferences(),
+                JournalAbbreviationLoader.loadBuiltInRepository(),
+                allowIntegerEdition).check();
         assertEquals(Collections.emptyList(), messages);
     }
 

diff --git a/src/test/java/org/jabref/logic/integrity/UTF8CheckerTest.java b/src/test/java/org/jabref/logic/integrity/UTF8CheckerTest.java
@@ -0,0 +1,74 @@
+package org.jabref.logic.integrity;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.List;
+
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.StandardField;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class UTF8CheckerTest {
+
+    /** Sample text with Chinese characters; its GBK byte representation is not well-formed UTF-8. */
+    private static final String SAMPLE_TEXT = "你好，这条语句使用GBK字符集";
+
+    private final BibEntry entry = new BibEntry();
+
+    /**
+     * A field that contains only ASCII characters must not produce any
+     * integrity message.
+     */
+    @Test
+    void fieldAcceptsUTF8() {
+        UTF8Checker checker = new UTF8Checker();
+        entry.setField(StandardField.TITLE, "Only ascii characters!'@12");
+        assertEquals(Collections.emptyList(), checker.check(entry));
+    }
+
+    /**
+     * With the "file.encoding" system property switched to GBK, the checker
+     * encodes the Chinese field value to GBK bytes, which are not valid UTF-8,
+     * so the field must be reported.
+     * The previous property value is restored in a finally block so that a
+     * failing assertion cannot leak the GBK setting into other tests.
+     */
+    @Test
+    void fieldDoesNotAcceptUmlauts() {
+        String defaultCharset = System.getProperty("file.encoding");
+        System.setProperty("file.encoding", "GBK");
+        try {
+            UTF8Checker checker = new UTF8Checker();
+            entry.setField(StandardField.MONTH, SAMPLE_TEXT);
+            assertEquals(List.of(new IntegrityMessage("Non-UTF-8 encoded field found", entry, StandardField.MONTH)), checker.check(entry));
+        } finally {
+            System.setProperty("file.encoding", defaultCharset);
+        }
+    }
+
+    /**
+     * GBK-encoded bytes of the sample text are not valid UTF-8, so
+     * UTF8Checker.UTF8EncodingChecker must return false for them.
+     *
+     * @throws UnsupportedEncodingException if the JVM does not provide the GBK charset
+     */
+    @Test
+    void NonUTF8EncodingCheckerTest() throws UnsupportedEncodingException {
+        assertFalse(UTF8Checker.UTF8EncodingChecker(SAMPLE_TEXT.getBytes("GBK")));
+    }
+
+    /**
+     * UTF-8-encoded bytes of the sample text must be accepted, i.e.
+     * UTF8Checker.UTF8EncodingChecker must return true.
+     */
+    @Test
+    void UTF8EncodingCheckerTest() {
+        assertTrue(UTF8Checker.UTF8EncodingChecker(SAMPLE_TEXT.getBytes(StandardCharsets.UTF_8)));
+    }
+}