From aed739c7ca115f886e69b280904ac635522eb19a Mon Sep 17 00:00:00 2001 From: mishako Date: Fri, 25 Dec 2015 17:46:07 +0100 Subject: [PATCH] Support CP1047 encoding --- .../java/com/rometools/rome/io/XmlReader.java | 18 +-- .../XmlReaderTest.java} | 147 +++++++++++++++++- 2 files changed, 146 insertions(+), 19 deletions(-) rename src/test/java/com/rometools/rome/{unittest/TestXmlReader.java => io/XmlReaderTest.java} (77%) diff --git a/src/main/java/com/rometools/rome/io/XmlReader.java b/src/main/java/com/rometools/rome/io/XmlReader.java index 511aa1f..5023f5c 100644 --- a/src/main/java/com/rometools/rome/io/XmlReader.java +++ b/src/main/java/com/rometools/rome/io/XmlReader.java @@ -66,6 +66,7 @@ public class XmlReader extends Reader { private static final String UTF_16BE = "UTF-16BE"; private static final String UTF_16LE = "UTF-16LE"; private static final String UTF_16 = "UTF-16"; + private static final String CP1047 = "CP1047"; private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)"); private static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE); private static final MessageFormat RAW_EX_1 = new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"); @@ -621,7 +622,7 @@ public class XmlReader extends Reader { } // returns the best guess for the encoding by looking the first bytes of the - // stream, ', NULL if none - private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc) throws IOException { + static String getXmlProlog(final InputStream is, final String guessedEnc) throws IOException { String encoding = null; if (guessedEnc != null) { final byte[] bytes = new byte[BUFFER_SIZE]; @@ -656,7 +659,7 @@ public class XmlReader extends Reader { offset += c; max -= c; c = is.read(bytes, offset, max); - firstGT = new String(bytes, 0, offset).indexOf(">"); + firstGT = new String(bytes, 0, offset, guessedEnc).indexOf(">"); } if (firstGT == -1) { if (c == -1) { @@ -668,14 +671,7 @@ public class XmlReader extends Reader { final int bytesRead = offset; if (bytesRead > 0) { is.reset(); - final Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes, 0, firstGT + 1), guessedEnc); - final BufferedReader bReader = new BufferedReader(reader); - final StringBuffer prolog = new StringBuffer(); - String line = bReader.readLine(); - while (line != null) { - prolog.append(line); - line = bReader.readLine(); - } + String prolog = new String(bytes, guessedEnc).substring(0, firstGT); final Matcher m = ENCODING_PATTERN.matcher(prolog); if (m.find()) { encoding = m.group(1).toUpperCase(Locale.ENGLISH); diff --git a/src/test/java/com/rometools/rome/unittest/TestXmlReader.java b/src/test/java/com/rometools/rome/io/XmlReaderTest.java similarity index 77% rename from src/test/java/com/rometools/rome/unittest/TestXmlReader.java rename to src/test/java/com/rometools/rome/io/XmlReaderTest.java index 9a0eb1c..9b36d85 100644 --- a/src/test/java/com/rometools/rome/unittest/TestXmlReader.java +++ b/src/test/java/com/rometools/rome/io/XmlReaderTest.java @@ -14,7 +14,17 @@ * limitations under the License. * */ -package com.rometools.rome.unittest; + +package com.rometools.rome.io; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -22,27 +32,27 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.Charset; import java.text.MessageFormat; import java.util.HashMap; import java.util.Map; -import junit.framework.TestCase; - -import com.rometools.rome.io.XmlReader; - /** * @author pat, tucu - * */ -public class TestXmlReader extends TestCase { +public class XmlReaderTest { + private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes"; private static final String XML4 = "xml-prolog-encoding-single-quotes"; private static final String XML3 = "xml-prolog-encoding-double-quotes"; private static final String XML2 = "xml-prolog"; private static final String XML1 = "xml"; + @Rule + public ExpectedException expectedException = ExpectedException.none(); + public static void main(final String[] args) throws Exception { - final TestXmlReader test = new TestXmlReader(); + final XmlReaderTest test = new XmlReaderTest(); test.testRawBom(); test.testRawNoBom(); test.testHttp(); @@ -92,10 +102,12 @@ public class TestXmlReader extends TestCase { } } + @Test public void testRawNoBom() throws Exception { testRawNoBomValid("US-ASCII"); testRawNoBomValid("UTF-8"); testRawNoBomValid("ISO-8859-1"); + testRawNoBomValid("CP1047"); } protected void testRawBomValid(final String encoding) throws Exception { @@ -120,6 +132,7 @@ public class TestXmlReader extends TestCase { } } + @Test public void testRawBom() throws Exception { testRawBomValid("UTF-8"); testRawBomValid("UTF-16BE"); @@ -136,6 +149,7 @@ public class TestXmlReader extends TestCase { testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8"); } + @Test public void testHttp() throws Exception { testHttpValid("application/xml", "no-bom", "US-ASCII", null); testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null); @@ -271,6 +285,7 @@ public class TestXmlReader extends TestCase { private static final String ENCODING_ATTRIBUTE_XML = " \n" + "\n" + "\n" + " \n" + " ", "UTF-8"); + final String guessedEncoding = "UTF-8"; + + final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding); + + assertEquals("TEST", prologEncoding); + } + + @Test + public void testGetXmlProlog_Utf8() throws IOException { + final InputStream input = stringToStream("", "UTF-8"); + final String guessedEncoding = "UTF-8"; + + final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding); + + assertEquals("UTF-8", prologEncoding); + } + + @Test + public void testGetXmlProlog_Utf16() throws IOException { + final InputStream input = stringToStream("", "UTF-16"); + final String guessedEncoding = "UTF-16"; + + assertEquals("UTF-16", XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_Cp1047() throws IOException { + final InputStream input = stringToStream("", "CP1047"); + final String guessedEncoding = "CP1047"; + + assertEquals("CP1047", XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_NoEncoding() throws IOException { + final InputStream input = stringToStream("", "UTF-8"); + final String guessedEncoding = "UTF-8"; + + assertNull(XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_GuessedIsNull() throws IOException { + final InputStream input = stringToStream("", "UTF-8"); + final String guessedEncoding = null; + + assertNull(XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_UppercaseResult() throws IOException { + final InputStream input = stringToStream("", "UTF-8"); + final String guessedEncoding = "UTF-8"; + + assertEquals("UTF-8", XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_DifferentAsciiCompatible() throws IOException { + final InputStream input = stringToStream("", "ISO-8859-1"); + final String guessedEncoding = "UTF-8"; + + assertEquals("TEST", XmlReader.getXmlProlog(input, guessedEncoding)); + } + + @Test + public void testGetXmlProlog_DifferentAsciiIncompatible() throws IOException { + final InputStream input = stringToStream("", "UTF-16BE"); + final String guessedEncoding = "UTF-16LE"; + + expectedException.expect(IOException.class); + + XmlReader.getXmlProlog(input, guessedEncoding); + } + + @Test + public void testGetXmlProlog_NoClosingAngleBracket() throws IOException { + final InputStream input = stringToStream("", "UTF-8"); + final String guessedEncoding = "UTF-8"; + + expectedException.expect(IOException.class); + + XmlReader.getXmlProlog(input, guessedEncoding); + } + + static InputStream stringToStream(String string, String encoding) { + return new ByteArrayInputStream(string.getBytes(Charset.forName(encoding))); + } }