Skip to content

Commit

Permalink
Fix for issue 5850: Journal abbreviations in UTF-8 not recognized (#7639
Browse files Browse the repository at this point in the history
)

* fix issue #5850 for encoding problem

* add a blank line for build.gradle

* initial as main branch for build.gradle

* initial as main branch for build.gradle

* add the change of fix information of issue 5850

* Fix check style

* Update CHANGELOG.md

Co-authored-by: Christoph <siedlerkiller@gmail.com>

* Add the utf8 check for biblatex and ascii check for bibtex

* add the new localization string to the l10n files

* fix error

* add the statement only in en.properties

* revert changes

* Update JabRef_da.properties

* Update JabRef_ru.properties

* Update build.gradle

* Update JabRef_fa.properties

* Update JabRef_no.properties

* Update JabRef_pl.properties

* Update JabRef_pt.properties

* Update JabRef_vi.properties

* Update JabRef_zh_TW.properties

* reset the default charset

* reset the default charset

* add the javaDoc of UTF8Checker

* add the javaDoc of UTF8CheckerTest and IntegrityCheckTest

add 2 Junit Test for UTF8Checker.UTF8EncodingChecker in UTF8CheckerTest

add 2 Junit Test for IntegrityCheck in IntegrityCheckTest

* Remove the unwieldy Junit tests

Co-authored-by: Christoph <siedlerkiller@gmail.com>
  • Loading branch information
MrGhabi and Siedlerchr authored Apr 23, 2021
1 parent 2af578f commit 434250d
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We fixed an issue where opening a BibTeX file (double-click) from a folder with spaces in its name did not work. [#6487](https://github.com/JabRef/jabref/issues/6487)
- We fixed an issue with saving large `.bib` files [#7265](https://github.com/JabRef/jabref/issues/7265)
- We fixed an issue with very large page numbers [#7590](https://github.com/JabRef/jabref/issues/7590)
- We fixed an issue where journal abbreviations in UTF-8 were not recognized [#5850](https://github.com/JabRef/jabref/issues/5850)
- We fixed an issue where the article title with curly brackets fails to download the arXiv link (pdf file). [#7633](https://github.com/JabRef/jabref/issues/7633)

### Removed
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
new CitationKeyDeviationChecker(bibDatabaseContext, citationKeyPatternPreferences),
new CitationKeyDuplicationChecker(bibDatabaseContext.getDatabase())
));

if (bibDatabaseContext.isBiblatexMode()) {
entryCheckers.add(new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository));
entryCheckers.addAll(List.of(
new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository),
new UTF8Checker())
);
} else {
entryCheckers.addAll(List.of(
new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository),
Expand All @@ -59,7 +61,6 @@ List<IntegrityMessage> check() {
for (BibEntry entry : database.getEntries()) {
result.addAll(checkEntry(entry));
}

result.addAll(checkDatabase(database));

return result;
Expand Down
53 changes: 53 additions & 0 deletions src/main/java/org/jabref/logic/integrity/UTF8Checker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.jabref.logic.integrity;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;

/**
 * Entry checker that reports fields whose content does not form valid UTF-8
 * once it has been encoded with the platform charset.
 *
 * <p>Field values are Java strings and therefore already decoded. The check
 * works by re-encoding every value with the charset named by the
 * {@code file.encoding} system property and then strictly decoding the
 * resulting bytes as UTF-8.
 */
public class UTF8Checker implements EntryChecker {

    /**
     * Checks every field of the given entry and reports those whose
     * platform-encoded bytes are not valid UTF-8.
     *
     * @param entry the entry whose fields are inspected
     * @return one {@link IntegrityMessage} per offending field; an empty list
     *         if all fields pass
     */
    @Override
    public List<IntegrityMessage> check(BibEntry entry) {
        List<IntegrityMessage> results = new ArrayList<>();
        // Resolved once per entry; the property is read on every call so that
        // a changed "file.encoding" is picked up.
        Charset charset = platformCharset();
        for (Map.Entry<Field, String> field : entry.getFieldMap().entrySet()) {
            byte[] encoded = field.getValue().getBytes(charset);
            if (!UTF8EncodingChecker(encoded)) {
                results.add(new IntegrityMessage(Localization.lang("Non-UTF-8 encoded field found"), entry,
                        field.getKey()));
            }
        }
        return results;
    }

    /**
     * Checks whether a byte array is well-formed UTF-8.
     *
     * <p>A fresh UTF-8 decoder is used; it fails on malformed or unmappable
     * input, which is reported here as {@code false}.
     *
     * @param data the bytes to validate
     * @return {@code true} if {@code data} decodes as UTF-8 without error,
     *         {@code false} otherwise
     */
    public static boolean UTF8EncodingChecker(byte[] data) {
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
        try {
            decoder.decode(ByteBuffer.wrap(data));
        } catch (CharacterCodingException ex) {
            // Decoding failed, so the bytes are not valid UTF-8.
            return false;
        }
        return true;
    }

    /**
     * Resolves the charset named by the {@code file.encoding} system property.
     *
     * @return the named charset, or {@link Charset#defaultCharset()} if the
     *         property is unset, malformed or names an unsupported charset
     */
    private static Charset platformCharset() {
        String name = System.getProperty("file.encoding");
        if (name == null) {
            return Charset.defaultCharset();
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException ignored) {
            // Covers IllegalCharsetNameException and UnsupportedCharsetException:
            // an unusable property value must not abort the integrity check.
            return Charset.defaultCharset();
        }
    }
}
1 change: 1 addition & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1629,6 +1629,7 @@ Style\ file=Style file
Open\ OpenOffice/LibreOffice\ connection=Open OpenOffice/LibreOffice connection
You\ must\ enter\ at\ least\ one\ field\ name=You must enter at least one field name
Non-ASCII\ encoded\ character\ found=Non-ASCII encoded character found
Non-UTF-8\ encoded\ field\ found=Non-UTF-8 encoded field found
Toggle\ web\ search\ interface=Toggle web search interface
Migration\ help\ information=Migration help information
Expand Down
14 changes: 7 additions & 7 deletions src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ private static Stream<String> provideCorrectFormat() {

private static Stream<String> provideIncorrectFormat() {
return Stream.of(" Knuth, Donald E. ",
"Knuth, Donald E. and Kurt Cobain and A. Einstein",
", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
"and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
"Knuth, Donald E. and Kurt Cobain and A. Einstein",
", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
"and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
}

@Test
Expand Down Expand Up @@ -190,10 +190,10 @@ private void assertCorrect(BibDatabaseContext context) {

private void assertCorrect(BibDatabaseContext context, boolean allowIntegerEdition) {
List<IntegrityMessage> messages = new IntegrityCheck(context,
mock(FilePreferences.class),
createCitationKeyPatternPreferences(),
JournalAbbreviationLoader.loadBuiltInRepository(),
allowIntegerEdition).check();
mock(FilePreferences.class),
createCitationKeyPatternPreferences(),
JournalAbbreviationLoader.loadBuiltInRepository(),
allowIntegerEdition).check();
assertEquals(Collections.emptyList(), messages);
}

Expand Down
74 changes: 74 additions & 0 deletions src/test/java/org/jabref/logic/integrity/UTF8CheckerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package org.jabref.logic.integrity;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for {@link UTF8Checker} and its static helper
 * {@link UTF8Checker#UTF8EncodingChecker(byte[])}.
 */
public class UTF8CheckerTest {

    /** Chinese sample text; its GBK encoding is not valid UTF-8. */
    private static final String CHINESE_SAMPLE = "你好,这条语句使用GBK字符集";

    private final BibEntry entry = new BibEntry();

    /**
     * A field containing only ASCII characters must not produce any message.
     */
    @Test
    void fieldAcceptsUTF8() {
        UTF8Checker checker = new UTF8Checker();
        entry.setField(StandardField.TITLE, "Only ascii characters!'@12");
        assertEquals(Collections.emptyList(), checker.check(entry));
    }

    /**
     * With {@code file.encoding} set to GBK, a field containing Chinese text
     * is encoded to bytes that are not valid UTF-8 and must be reported.
     *
     * <p>The system property is JVM-wide, so it is restored in a
     * {@code finally} block to keep a failing assertion from affecting
     * other tests.
     */
    @Test
    void fieldDoesNotAcceptUmlauts() {
        String previousEncoding = System.getProperty("file.encoding");
        System.setProperty("file.encoding", "GBK");
        try {
            UTF8Checker checker = new UTF8Checker();
            entry.setField(StandardField.MONTH, CHINESE_SAMPLE);
            assertEquals(List.of(new IntegrityMessage("Non-UTF-8 encoded field found", entry, StandardField.MONTH)), checker.check(entry));
        } finally {
            if (previousEncoding == null) {
                System.clearProperty("file.encoding");
            } else {
                System.setProperty("file.encoding", previousEncoding);
            }
        }
    }

    /**
     * GBK-encoded bytes of the sample text must be rejected by
     * {@link UTF8Checker#UTF8EncodingChecker(byte[])}.
     *
     * @throws UnsupportedEncodingException if the GBK charset is not available
     */
    @Test
    void NonUTF8EncodingCheckerTest() throws UnsupportedEncodingException {
        // Explicit charset: the bytes under test must not depend on the platform default.
        assertFalse(UTF8Checker.UTF8EncodingChecker(CHINESE_SAMPLE.getBytes("GBK")));
    }

    /**
     * UTF-8-encoded bytes of the sample text must be accepted by
     * {@link UTF8Checker#UTF8EncodingChecker(byte[])}.
     */
    @Test
    void UTF8EncodingCheckerTest() {
        assertTrue(UTF8Checker.UTF8EncodingChecker(CHINESE_SAMPLE.getBytes(StandardCharsets.UTF_8)));
    }
}

0 comments on commit 434250d

Please sign in to comment.