Skip to content

Commit 300a4e4

Browse files
gnodet and belingueres authored
Fix parsing a UTF-8 file without BOM and ISO-8859-1 encoding (#1)
* do not try to discover the encoding used when the input is given as a Reader
* simplified test-encoding-ISO-8859-1.xml test file
* fixed tests exercising encoding checks. Unsupported tests were skipped

---------

Co-authored-by: Gabriel Belingueres <belingueres@gmail.com>
1 parent 31016cd commit 300a4e4

File tree

5 files changed

+127
-1563
lines changed

5 files changed

+127
-1563
lines changed

Diff for: src/main/java/org/codehaus/plexus/util/xml/XmlReader.java

+2-5
Original file line numberDiff line numberDiff line change
@@ -523,11 +523,8 @@ else if ( bomEnc.equals( UTF_8 ) )
523523
}
524524
else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
525525
{
526-
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
527-
{
528-
throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
529-
}
530-
if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
526+
if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc )
527+
|| xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
531528
{
532529
throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
533530
bomEnc, xmlGuessEnc, xmlEnc, is );

Diff for: src/main/java/org/codehaus/plexus/util/xml/pull/MXParser.java

+14-28
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@
1515
import java.io.Reader;
1616
import java.io.UnsupportedEncodingException;
1717

18-
import org.codehaus.plexus.util.xml.XmlReader;
1918
import org.codehaus.plexus.util.xml.XmlStreamReader;
19+
import org.codehaus.plexus.util.xml.XmlStreamReaderException;
2020

21-
//import java.util.Hashtable;
2221

2322
//TODO best handling of interning issues
2423
// have isAllNewStringInterned ???
@@ -663,20 +662,6 @@ public void setInput( Reader in )
663662
{
664663
reset();
665664
reader = in;
666-
667-
if ( reader instanceof XmlReader ) {
668-
// encoding already detected
669-
XmlReader xsr = (XmlReader) reader;
670-
fileEncoding = xsr.getEncoding();
671-
}
672-
else if ( reader instanceof InputStreamReader )
673-
{
674-
InputStreamReader isr = (InputStreamReader) reader;
675-
if ( isr.getEncoding() != null )
676-
{
677-
fileEncoding = isr.getEncoding().toUpperCase();
678-
}
679-
}
680665
}
681666

682667
@Override
@@ -696,14 +681,26 @@ public void setInput( java.io.InputStream inputStream, String inputEncoding )
696681
}
697682
else
698683
{
699-
reader = new XmlStreamReader( inputStream );
684+
reader = new XmlStreamReader( inputStream, false );
700685
}
701686
}
702687
catch ( UnsupportedEncodingException une )
703688
{
704689
throw new XmlPullParserException( "could not create reader for encoding " + inputEncoding + " : " + une,
705690
this, une );
706691
}
692+
catch ( XmlStreamReaderException e )
693+
{
694+
if ( "UTF-8".equals( e.getBomEncoding() ) )
695+
{
696+
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + e.getXmlEncoding() + " is incompatible", this, e );
697+
}
698+
if ( e.getBomEncoding() != null && e.getBomEncoding().startsWith( "UTF-16" ) )
699+
{
700+
throw new XmlPullParserException( "UTF-16 BOM in a " + e.getXmlEncoding() + " encoded file is incompatible", this, e );
701+
}
702+
throw new XmlPullParserException( "could not create reader : " + e, this, e );
703+
}
707704
catch ( IOException e )
708705
{
709706
throw new XmlPullParserException( "could not create reader : " + e, this, e );
@@ -3434,17 +3431,6 @@ private void parseXmlDeclWithVersion( int versionStart, int versionEnd )
34343431
// TODO reconcile with setInput encodingName
34353432
inputEncoding = newString( buf, encodingStart, encodingEnd - encodingStart );
34363433

3437-
if ( "UTF8".equals( fileEncoding ) && inputEncoding.toUpperCase().startsWith( "ISO-" ) )
3438-
{
3439-
throw new XmlPullParserException( "UTF-8 BOM plus xml decl of " + inputEncoding + " is incompatible",
3440-
this, null );
3441-
}
3442-
else if ("UTF-16".equals( fileEncoding ) && inputEncoding.equalsIgnoreCase( "UTF-8" ))
3443-
{
3444-
throw new XmlPullParserException( "UTF-16 BOM plus xml decl of " + inputEncoding + " is incompatible",
3445-
this, null );
3446-
}
3447-
34483434
lastParsedAttr = "encoding";
34493435

34503436
ch = more();

Diff for: src/test/java/org/codehaus/plexus/util/xml/pull/MXParserTest.java

+95-9
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,13 @@
2323

2424
import java.io.EOFException;
2525
import java.io.File;
26+
import java.io.FileInputStream;
2627
import java.io.IOException;
2728
import java.io.InputStream;
29+
import java.io.InputStreamReader;
2830
import java.io.Reader;
2931
import java.io.StringReader;
32+
import java.nio.charset.StandardCharsets;
3033
import java.nio.file.Files;
3134
import java.nio.file.Paths;
3235

@@ -968,7 +971,7 @@ public void testXMLDeclVersionEncodingStandaloneNoSpace()
968971
* @since 3.4.1
969972
*/
970973
@Test
971-
public void testEncodingISO_8859_1setInputReader()
974+
public void testEncodingISO_8859_1_newXmlReader()
972975
throws IOException
973976
{
974977
try ( Reader reader =
@@ -994,7 +997,7 @@ public void testEncodingISO_8859_1setInputReader()
994997
* @since 3.4.1
995998
*/
996999
@Test
997-
public void testEncodingISO_8859_1_setInputStream()
1000+
public void testEncodingISO_8859_1_InputStream()
9981001
throws IOException
9991002
{
10001003
try ( InputStream input =
@@ -1012,12 +1015,6 @@ public void testEncodingISO_8859_1_setInputStream()
10121015
}
10131016
}
10141017

1015-
private static void assertPosition( int row, int col, MXParser parser )
1016-
{
1017-
assertEquals( "Current line", row, parser.getLineNumber() );
1018-
assertEquals( "Current column", col, parser.getColumnNumber() );
1019-
}
1020-
10211018
/**
10221019
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
10231020
*
@@ -1028,7 +1025,7 @@ private static void assertPosition( int row, int col, MXParser parser )
10281025
* @since 3.4.2
10291026
*/
10301027
@Test
1031-
public void testEncodingISO_8859_1setStringReader()
1028+
public void testEncodingISO_8859_1_StringReader()
10321029
throws IOException
10331030
{
10341031
String xmlFileContents;
@@ -1050,6 +1047,95 @@ public void testEncodingISO_8859_1setStringReader()
10501047
}
10511048
}
10521049

1050+
/**
1051+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1052+
*
1053+
* Another case of bug #163: Reader generated with ReaderFactory.newReader and the right file encoding.
1054+
*
1055+
* @throws IOException if IO error.
1056+
*
1057+
* @since 3.5.2
1058+
*/
1059+
@Test
1060+
public void testEncodingISO_8859_1_newReader()
1061+
throws IOException
1062+
{
1063+
// NOTE: if using Files.newBufferedReader(path, StandardCharsets.UTF_8), the reader will throw an exception
1064+
// because the decoder created by new InputStreamReader() is lenient while the one created by
1065+
// Files.newBufferedReader() is not.
1066+
try ( Reader reader = new InputStreamReader( Files.newInputStream(
1067+
Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ),
1068+
StandardCharsets.UTF_8 ) )
1069+
{
1070+
MXParser parser = new MXParser();
1071+
parser.setInput( reader );
1072+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1073+
;
1074+
assertTrue( true );
1075+
}
1076+
catch ( XmlPullParserException e )
1077+
{
1078+
fail( "should not raise exception: " + e );
1079+
}
1080+
}
1081+
1082+
/**
1083+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1084+
*
1085+
* Another case of bug #163: InputStream supplied with the right file encoding.
1086+
*
1087+
* @throws IOException if IO error.
1088+
*
1089+
* @since 3.5.2
1090+
*/
1091+
@Test
1092+
public void testEncodingISO_8859_1_InputStream_encoded() throws IOException {
1093+
try ( InputStream input =
1094+
Files.newInputStream( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1095+
{
1096+
MXParser parser = new MXParser();
1097+
parser.setInput( input, StandardCharsets.UTF_8.name() );
1098+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1099+
;
1100+
assertTrue( true );
1101+
}
1102+
catch ( XmlPullParserException e )
1103+
{
1104+
fail( "should not raise exception: " + e );
1105+
}
1106+
}
1107+
1108+
/**
1109+
* Issue 163: https://github.com/codehaus-plexus/plexus-utils/issues/163
1110+
*
1111+
* @throws IOException if IO error.
1112+
*
1113+
* @since 3.4.1
1114+
*/
1115+
@Test
1116+
public void testEncodingUTF8_newXmlReader()
1117+
throws IOException
1118+
{
1119+
try ( Reader reader = new XmlStreamReader( Paths.get( "src/test/resources/xml", "test-encoding-ISO-8859-1.xml" ) ) )
1120+
{
1121+
MXParser parser = new MXParser();
1122+
parser.setInput( reader );
1123+
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
1124+
;
1125+
assertTrue( true );
1126+
}
1127+
catch ( XmlPullParserException e )
1128+
{
1129+
fail( "should not raise exception: " + e );
1130+
}
1131+
}
1132+
1133+
private static void assertPosition( int row, int col, MXParser parser )
1134+
{
1135+
assertEquals( "Current line", row, parser.getLineNumber() );
1136+
assertEquals( "Current column", col, parser.getColumnNumber() );
1137+
}
1138+
10531139
/**
10541140
* <p>
10551141
* Test custom Entity not found.

Diff for: src/test/java/org/codehaus/plexus/util/xml/pull/eduni_misc_Test_BjoernHoehrmannviaHST2013_09_18_Test.java

+14-19
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@
77
import java.io.FileInputStream;
88
import java.io.FileReader;
99
import java.io.IOException;
10-
import java.io.InputStreamReader;
10+
import java.io.InputStream;
1111
import java.io.Reader;
12-
import java.nio.charset.StandardCharsets;
13-
1412
import org.junit.Before;
1513
import org.junit.Test;
1614

@@ -212,17 +210,16 @@ public void testhst_bh_006()
212210
public void testhst_lhs_007()
213211
throws IOException
214212
{
215-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) );
216-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
213+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "007.xml" ) ) )
217214
{
218-
parser.setInput( reader );
215+
parser.setInput( is, null );
219216
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
220217
;
221-
fail( "UTF-8 BOM plus xml decl of iso-8859-1 incompatible" );
218+
fail( "UTF-8 BOM plus xml decl of ISO-8859-1 incompatible" );
222219
}
223220
catch ( XmlPullParserException e )
224221
{
225-
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of iso-8859-1 is incompatible" ) );
222+
assertTrue( e.getMessage().contains( "UTF-8 BOM plus xml decl of ISO-8859-1 is incompatible" ) );
226223
}
227224
}
228225

@@ -239,17 +236,16 @@ public void testhst_lhs_007()
239236
public void testhst_lhs_008()
240237
throws IOException
241238
{
242-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) );
243-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_16 ) )
239+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "008.xml" ) ) )
244240
{
245-
parser.setInput( reader );
241+
parser.setInput( is, null );
246242
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
247243
;
248-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-16 coding) incompatible" );
244+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-16 coding) incompatible" );
249245
}
250246
catch ( XmlPullParserException e )
251247
{
252-
assertTrue( e.getMessage().contains( "UTF-16 BOM plus xml decl of utf-8 is incompatible" ) );
248+
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
253249
}
254250
}
255251

@@ -266,17 +262,16 @@ public void testhst_lhs_008()
266262
public void testhst_lhs_009()
267263
throws IOException
268264
{
269-
try ( FileInputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) );
270-
InputStreamReader reader = new InputStreamReader( is, StandardCharsets.UTF_8 ) )
271-
{
272-
parser.setInput( reader );
265+
try ( InputStream is = new FileInputStream( new File( testResourcesDir, "009.xml" ) ) )
266+
{
267+
parser.setInput( is, null );
273268
while ( parser.nextToken() != XmlPullParser.END_DOCUMENT )
274269
;
275-
fail( "UTF-16 BOM plus xml decl of utf-8 (using UTF-8 coding) incompatible" );
270+
fail( "UTF-16 BOM plus xml decl of UTF-8 (using UTF-8 coding) incompatible" );
276271
}
277272
catch ( XmlPullParserException e )
278273
{
279-
assertTrue( e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
274+
assertTrue( e.getMessage(), e.getMessage().contains( "UTF-16 BOM in a UTF-8 encoded file is incompatible" ) );
280275
}
281276
}
282277

0 commit comments

Comments
 (0)