Support CP1047 encoding
This commit is contained in:
parent
b0f3908956
commit
aed739c7ca
2 changed files with 146 additions and 19 deletions
|
@ -66,6 +66,7 @@ public class XmlReader extends Reader {
|
||||||
private static final String UTF_16BE = "UTF-16BE";
|
private static final String UTF_16BE = "UTF-16BE";
|
||||||
private static final String UTF_16LE = "UTF-16LE";
|
private static final String UTF_16LE = "UTF-16LE";
|
||||||
private static final String UTF_16 = "UTF-16";
|
private static final String UTF_16 = "UTF-16";
|
||||||
|
private static final String CP1047 = "CP1047";
|
||||||
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
|
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
|
||||||
private static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
|
private static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
|
||||||
private static final MessageFormat RAW_EX_1 = new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
|
private static final MessageFormat RAW_EX_1 = new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
|
||||||
|
@ -621,7 +622,7 @@ public class XmlReader extends Reader {
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns the best guess for the encoding by looking at the first bytes of the
|
// returns the best guess for the encoding by looking at the first bytes of the
|
||||||
// stream, '<?'
|
// stream, '<?xm'
|
||||||
private static String getXMLGuessEncoding(final BufferedInputStream is) throws IOException {
|
private static String getXMLGuessEncoding(final BufferedInputStream is) throws IOException {
|
||||||
String encoding = null;
|
String encoding = null;
|
||||||
final int[] bytes = new int[4];
|
final int[] bytes = new int[4];
|
||||||
|
@ -638,12 +639,14 @@ public class XmlReader extends Reader {
|
||||||
encoding = UTF_16LE;
|
encoding = UTF_16LE;
|
||||||
} else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
|
} else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
|
||||||
encoding = UTF_8;
|
encoding = UTF_8;
|
||||||
|
} else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
|
||||||
|
encoding = CP1047;
|
||||||
}
|
}
|
||||||
return encoding;
|
return encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns the encoding declared in the <?xml encoding=...?>, NULL if none
|
// returns the encoding declared in the <?xml encoding=...?>, NULL if none
|
||||||
private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc) throws IOException {
|
static String getXmlProlog(final InputStream is, final String guessedEnc) throws IOException {
|
||||||
String encoding = null;
|
String encoding = null;
|
||||||
if (guessedEnc != null) {
|
if (guessedEnc != null) {
|
||||||
final byte[] bytes = new byte[BUFFER_SIZE];
|
final byte[] bytes = new byte[BUFFER_SIZE];
|
||||||
|
@ -656,7 +659,7 @@ public class XmlReader extends Reader {
|
||||||
offset += c;
|
offset += c;
|
||||||
max -= c;
|
max -= c;
|
||||||
c = is.read(bytes, offset, max);
|
c = is.read(bytes, offset, max);
|
||||||
firstGT = new String(bytes, 0, offset).indexOf(">");
|
firstGT = new String(bytes, 0, offset, guessedEnc).indexOf(">");
|
||||||
}
|
}
|
||||||
if (firstGT == -1) {
|
if (firstGT == -1) {
|
||||||
if (c == -1) {
|
if (c == -1) {
|
||||||
|
@ -668,14 +671,7 @@ public class XmlReader extends Reader {
|
||||||
final int bytesRead = offset;
|
final int bytesRead = offset;
|
||||||
if (bytesRead > 0) {
|
if (bytesRead > 0) {
|
||||||
is.reset();
|
is.reset();
|
||||||
final Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes, 0, firstGT + 1), guessedEnc);
|
String prolog = new String(bytes, guessedEnc).substring(0, firstGT);
|
||||||
final BufferedReader bReader = new BufferedReader(reader);
|
|
||||||
final StringBuffer prolog = new StringBuffer();
|
|
||||||
String line = bReader.readLine();
|
|
||||||
while (line != null) {
|
|
||||||
prolog.append(line);
|
|
||||||
line = bReader.readLine();
|
|
||||||
}
|
|
||||||
final Matcher m = ENCODING_PATTERN.matcher(prolog);
|
final Matcher m = ENCODING_PATTERN.matcher(prolog);
|
||||||
if (m.find()) {
|
if (m.find()) {
|
||||||
encoding = m.group(1).toUpperCase(Locale.ENGLISH);
|
encoding = m.group(1).toUpperCase(Locale.ENGLISH);
|
||||||
|
|
|
@ -14,7 +14,17 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
package com.rometools.rome.unittest;
|
|
||||||
|
package com.rometools.rome.io;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
|
import org.junit.Rule;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.rules.ExpectedException;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
|
@ -22,27 +32,27 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.text.MessageFormat;
|
import java.text.MessageFormat;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
|
||||||
|
|
||||||
import com.rometools.rome.io.XmlReader;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author pat, tucu
|
* @author pat, tucu
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public class TestXmlReader extends TestCase {
|
public class XmlReaderTest {
|
||||||
|
|
||||||
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
|
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
|
||||||
private static final String XML4 = "xml-prolog-encoding-single-quotes";
|
private static final String XML4 = "xml-prolog-encoding-single-quotes";
|
||||||
private static final String XML3 = "xml-prolog-encoding-double-quotes";
|
private static final String XML3 = "xml-prolog-encoding-double-quotes";
|
||||||
private static final String XML2 = "xml-prolog";
|
private static final String XML2 = "xml-prolog";
|
||||||
private static final String XML1 = "xml";
|
private static final String XML1 = "xml";
|
||||||
|
|
||||||
|
@Rule
|
||||||
|
public ExpectedException expectedException = ExpectedException.none();
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final TestXmlReader test = new TestXmlReader();
|
final XmlReaderTest test = new XmlReaderTest();
|
||||||
test.testRawBom();
|
test.testRawBom();
|
||||||
test.testRawNoBom();
|
test.testRawNoBom();
|
||||||
test.testHttp();
|
test.testHttp();
|
||||||
|
@ -92,10 +102,12 @@ public class TestXmlReader extends TestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testRawNoBom() throws Exception {
|
public void testRawNoBom() throws Exception {
|
||||||
testRawNoBomValid("US-ASCII");
|
testRawNoBomValid("US-ASCII");
|
||||||
testRawNoBomValid("UTF-8");
|
testRawNoBomValid("UTF-8");
|
||||||
testRawNoBomValid("ISO-8859-1");
|
testRawNoBomValid("ISO-8859-1");
|
||||||
|
testRawNoBomValid("CP1047");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void testRawBomValid(final String encoding) throws Exception {
|
protected void testRawBomValid(final String encoding) throws Exception {
|
||||||
|
@ -120,6 +132,7 @@ public class TestXmlReader extends TestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testRawBom() throws Exception {
|
public void testRawBom() throws Exception {
|
||||||
testRawBomValid("UTF-8");
|
testRawBomValid("UTF-8");
|
||||||
testRawBomValid("UTF-16BE");
|
testRawBomValid("UTF-16BE");
|
||||||
|
@ -136,6 +149,7 @@ public class TestXmlReader extends TestCase {
|
||||||
testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
|
testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testHttp() throws Exception {
|
public void testHttp() throws Exception {
|
||||||
testHttpValid("application/xml", "no-bom", "US-ASCII", null);
|
testHttpValid("application/xml", "no-bom", "US-ASCII", null);
|
||||||
testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
|
testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
|
||||||
|
@ -271,6 +285,7 @@ public class TestXmlReader extends TestCase {
|
||||||
private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n" + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n" + "\n"
|
private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n" + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n" + "\n"
|
||||||
+ " <atom:entry>\n" + " <atom:title encoding=\"base64\"><![CDATA\n" + "aW5nTGluZSIgLz4";
|
+ " <atom:entry>\n" + " <atom:title encoding=\"base64\"><![CDATA\n" + "aW5nTGluZSIgLz4";
|
||||||
|
|
||||||
|
@Test
|
||||||
public void testEncodingAttributeXML() throws Exception {
|
public void testEncodingAttributeXML() throws Exception {
|
||||||
final InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML.getBytes());
|
final InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML.getBytes());
|
||||||
final XmlReader xmlReader = new XmlReader(is, "", true);
|
final XmlReader xmlReader = new XmlReader(is, "", true);
|
||||||
|
@ -348,4 +363,120 @@ public class TestXmlReader extends TestCase {
|
||||||
return new ByteArrayInputStream(baos.toByteArray());
|
return new ByteArrayInputStream(baos.toByteArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
|
||||||
|
assertEquals("TEST", prologEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_Utf8() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
|
||||||
|
assertEquals("UTF-8", prologEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_Utf16() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"UTF-16\"?>", "UTF-16");
|
||||||
|
final String guessedEncoding = "UTF-16";
|
||||||
|
|
||||||
|
assertEquals("UTF-16", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_Cp1047() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"CP1047\"?>", "CP1047");
|
||||||
|
final String guessedEncoding = "CP1047";
|
||||||
|
|
||||||
|
assertEquals("CP1047", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_NoEncoding() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml>", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
assertNull(XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_GuessedIsNull() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
|
||||||
|
final String guessedEncoding = null;
|
||||||
|
|
||||||
|
assertNull(XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_UppercaseResult() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"utf-8\"?>", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
assertEquals("UTF-8", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_DifferentAsciiCompatible() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "ISO-8859-1");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
assertEquals("TEST", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_DifferentAsciiIncompatible() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-16BE");
|
||||||
|
final String guessedEncoding = "UTF-16LE";
|
||||||
|
|
||||||
|
expectedException.expect(IOException.class);
|
||||||
|
|
||||||
|
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_NoClosingAngleBracket() throws IOException {
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"TEST\"", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
expectedException.expect(IOException.class);
|
||||||
|
|
||||||
|
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_Empty() throws IOException {
|
||||||
|
final InputStream input = stringToStream("", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
expectedException.expect(IOException.class);
|
||||||
|
|
||||||
|
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testGetXmlProlog_ClosingAngleBracketIsTooFar() throws IOException {
|
||||||
|
final StringBuilder spaces = new StringBuilder();
|
||||||
|
for (int i = 0; i < 5000; i++) {
|
||||||
|
spaces.append(" ");
|
||||||
|
}
|
||||||
|
|
||||||
|
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?" + spaces + ">", "UTF-8");
|
||||||
|
final String guessedEncoding = "UTF-8";
|
||||||
|
|
||||||
|
expectedException.expect(IOException.class);
|
||||||
|
|
||||||
|
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
static InputStream stringToStream(String string, String encoding) {
|
||||||
|
return new ByteArrayInputStream(string.getBytes(Charset.forName(encoding)));
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Reference in a new issue