Support CP1047 encoding

This commit is contained in:
mishako 2015-12-25 17:46:07 +01:00
parent b0f3908956
commit aed739c7ca
2 changed files with 146 additions and 19 deletions

View file

@ -66,6 +66,7 @@ public class XmlReader extends Reader {
private static final String UTF_16BE = "UTF-16BE";
private static final String UTF_16LE = "UTF-16LE";
private static final String UTF_16 = "UTF-16";
private static final String CP1047 = "CP1047";
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
private static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
private static final MessageFormat RAW_EX_1 = new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
@ -621,7 +622,7 @@ public class XmlReader extends Reader {
}
// returns the best guess for the encoding by looking at the first bytes of the
// stream, '<?'
// stream, '<?xm'
private static String getXMLGuessEncoding(final BufferedInputStream is) throws IOException {
String encoding = null;
final int[] bytes = new int[4];
@ -638,12 +639,14 @@ public class XmlReader extends Reader {
encoding = UTF_16LE;
} else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
encoding = UTF_8;
} else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
encoding = CP1047;
}
return encoding;
}
// returns the encoding declared in the <?xml encoding=...?>, NULL if none
private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc) throws IOException {
static String getXmlProlog(final InputStream is, final String guessedEnc) throws IOException {
String encoding = null;
if (guessedEnc != null) {
final byte[] bytes = new byte[BUFFER_SIZE];
@ -656,7 +659,7 @@ public class XmlReader extends Reader {
offset += c;
max -= c;
c = is.read(bytes, offset, max);
firstGT = new String(bytes, 0, offset).indexOf(">");
firstGT = new String(bytes, 0, offset, guessedEnc).indexOf(">");
}
if (firstGT == -1) {
if (c == -1) {
@ -668,14 +671,7 @@ public class XmlReader extends Reader {
final int bytesRead = offset;
if (bytesRead > 0) {
is.reset();
final Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes, 0, firstGT + 1), guessedEnc);
final BufferedReader bReader = new BufferedReader(reader);
final StringBuffer prolog = new StringBuffer();
String line = bReader.readLine();
while (line != null) {
prolog.append(line);
line = bReader.readLine();
}
String prolog = new String(bytes, guessedEnc).substring(0, firstGT);
final Matcher m = ENCODING_PATTERN.matcher(prolog);
if (m.find()) {
encoding = m.group(1).toUpperCase(Locale.ENGLISH);

View file

@ -14,7 +14,17 @@
* limitations under the License.
*
*/
package com.rometools.rome.unittest;
package com.rometools.rome.io;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@ -22,27 +32,27 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.text.MessageFormat;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import com.rometools.rome.io.XmlReader;
/**
* @author pat, tucu
*
*/
public class TestXmlReader extends TestCase {
public class XmlReaderTest {
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
private static final String XML4 = "xml-prolog-encoding-single-quotes";
private static final String XML3 = "xml-prolog-encoding-double-quotes";
private static final String XML2 = "xml-prolog";
private static final String XML1 = "xml";
@Rule
public ExpectedException expectedException = ExpectedException.none();
public static void main(final String[] args) throws Exception {
final TestXmlReader test = new TestXmlReader();
final XmlReaderTest test = new XmlReaderTest();
test.testRawBom();
test.testRawNoBom();
test.testHttp();
@ -92,10 +102,12 @@ public class TestXmlReader extends TestCase {
}
}
/**
 * Runs the BOM-less raw-stream validity check once for every supported
 * encoding, in a fixed order.
 */
@Test
public void testRawNoBom() throws Exception {
    final String[] encodings = { "US-ASCII", "UTF-8", "ISO-8859-1", "CP1047" };
    for (final String encoding : encodings) {
        testRawNoBomValid(encoding);
    }
}
protected void testRawBomValid(final String encoding) throws Exception {
@ -120,6 +132,7 @@ public class TestXmlReader extends TestCase {
}
}
@Test
public void testRawBom() throws Exception {
testRawBomValid("UTF-8");
testRawBomValid("UTF-16BE");
@ -136,6 +149,7 @@ public class TestXmlReader extends TestCase {
testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
}
@Test
public void testHttp() throws Exception {
testHttpValid("application/xml", "no-bom", "US-ASCII", null);
testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
@ -271,6 +285,7 @@ public class TestXmlReader extends TestCase {
private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n" + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n" + "\n"
+ " <atom:entry>\n" + " <atom:title encoding=\"base64\"><![CDATA\n" + "aW5nTGluZSIgLz4";
@Test
public void testEncodingAttributeXML() throws Exception {
final InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML.getBytes());
final XmlReader xmlReader = new XmlReader(is, "", true);
@ -348,4 +363,120 @@ public class TestXmlReader extends TestCase {
return new ByteArrayInputStream(baos.toByteArray());
}
/**
 * The value of the prolog's {@code encoding} attribute is returned as-is
 * when it is already upper case, even if it is not a real charset name.
 */
@Test
public void testGetXmlProlog() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-8");
    assertEquals("TEST", XmlReader.getXmlProlog(stream, "UTF-8"));
}
/**
 * A UTF-8 encoded prolog declaring UTF-8 yields {@code "UTF-8"} when the
 * guessed encoding is UTF-8 as well.
 */
@Test
public void testGetXmlProlog_Utf8() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
    assertEquals("UTF-8", XmlReader.getXmlProlog(stream, "UTF-8"));
}
/**
 * A prolog encoded as UTF-16 is decoded with the guessed UTF-16 encoding and
 * its declared encoding is reported.
 */
@Test
public void testGetXmlProlog_Utf16() throws IOException {
    final String charsetName = "UTF-16";
    final InputStream stream = stringToStream("<?xml encoding=\"UTF-16\"?>", charsetName);
    final String declared = XmlReader.getXmlProlog(stream, charsetName);
    assertEquals("UTF-16", declared);
}
/**
 * A prolog encoded as CP1047 is decoded with the guessed CP1047 encoding and
 * its declared encoding is reported.
 */
@Test
public void testGetXmlProlog_Cp1047() throws IOException {
    final String charsetName = "CP1047";
    final InputStream stream = stringToStream("<?xml encoding=\"CP1047\"?>", charsetName);
    final String declared = XmlReader.getXmlProlog(stream, charsetName);
    assertEquals("CP1047", declared);
}
/**
 * A prolog without an {@code encoding} attribute produces {@code null}.
 */
@Test
public void testGetXmlProlog_NoEncoding() throws IOException {
    final InputStream stream = stringToStream("<?xml>", "UTF-8");
    final String declared = XmlReader.getXmlProlog(stream, "UTF-8");
    assertNull(declared);
}
/**
 * Without a guessed encoding the prolog is not inspected: the result is
 * {@code null} even though the stream declares an encoding.
 */
@Test
public void testGetXmlProlog_GuessedIsNull() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
    final String declared = XmlReader.getXmlProlog(stream, null);
    assertNull(declared);
}
/**
 * A lower-case declared encoding ({@code utf-8}) is reported in upper case.
 */
@Test
public void testGetXmlProlog_UppercaseResult() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"utf-8\"?>", "UTF-8");
    final String declared = XmlReader.getXmlProlog(stream, "UTF-8");
    assertEquals("UTF-8", declared);
}
/**
 * The bytes are ISO-8859-1 while the guess is UTF-8; the prolog is still
 * readable and the declared encoding is returned.
 */
@Test
public void testGetXmlProlog_DifferentAsciiCompatible() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"TEST\"?>", "ISO-8859-1");
    final String declared = XmlReader.getXmlProlog(stream, "UTF-8");
    assertEquals("TEST", declared);
}
/**
 * The bytes are UTF-16BE while the guess is UTF-16LE; reading the prolog is
 * expected to fail with an {@link IOException}.
 */
@Test
public void testGetXmlProlog_DifferentAsciiIncompatible() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-16BE");
    // The expectation is registered right before the call that must throw.
    expectedException.expect(IOException.class);
    XmlReader.getXmlProlog(stream, "UTF-16LE");
}
/**
 * A prolog that ends before any {@code '>'} is expected to fail with an
 * {@link IOException}.
 */
@Test
public void testGetXmlProlog_NoClosingAngleBracket() throws IOException {
    final InputStream stream = stringToStream("<?xml encoding=\"TEST\"", "UTF-8");
    // The expectation is registered right before the call that must throw.
    expectedException.expect(IOException.class);
    XmlReader.getXmlProlog(stream, "UTF-8");
}
/**
 * An empty stream is expected to fail with an {@link IOException}.
 */
@Test
public void testGetXmlProlog_Empty() throws IOException {
    final InputStream stream = stringToStream("", "UTF-8");
    // The expectation is registered right before the call that must throw.
    expectedException.expect(IOException.class);
    XmlReader.getXmlProlog(stream, "UTF-8");
}
/**
 * 5000 spaces are placed between the end of the prolog attributes and the
 * closing {@code '>'}; reading the prolog is expected to fail with an
 * {@link IOException}.
 */
@Test
public void testGetXmlProlog_ClosingAngleBracketIsTooFar() throws IOException {
    // NOTE(review): 5000 is presumably chosen to exceed XmlReader's read buffer - confirm.
    final char[] padding = new char[5000];
    java.util.Arrays.fill(padding, ' ');
    final String prolog = "<?xml encoding=\"TEST\"?" + new String(padding) + ">";
    final InputStream stream = stringToStream(prolog, "UTF-8");
    // The expectation is registered right before the call that must throw.
    expectedException.expect(IOException.class);
    XmlReader.getXmlProlog(stream, "UTF-8");
}
/**
 * Encodes a string with the named charset and exposes the resulting bytes as
 * an in-memory stream.
 *
 * @param string the text to encode
 * @param encoding the charset name, resolved through {@link Charset#forName(String)}
 * @return a stream over the encoded bytes
 */
static InputStream stringToStream(String string, String encoding) {
    final Charset charset = Charset.forName(encoding);
    final byte[] encoded = string.getBytes(charset);
    return new ByteArrayInputStream(encoded);
}
}