Support CP1047 encoding
This commit is contained in:
parent
b0f3908956
commit
aed739c7ca
2 changed files with 146 additions and 19 deletions
|
@ -66,6 +66,7 @@ public class XmlReader extends Reader {
|
|||
private static final String UTF_16BE = "UTF-16BE";
|
||||
private static final String UTF_16LE = "UTF-16LE";
|
||||
private static final String UTF_16 = "UTF-16";
|
||||
private static final String CP1047 = "CP1047";
|
||||
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
|
||||
private static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE);
|
||||
private static final MessageFormat RAW_EX_1 = new MessageFormat("Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
|
||||
|
@ -621,7 +622,7 @@ public class XmlReader extends Reader {
|
|||
}
|
||||
|
||||
// returns the best guess for the encoding by looking the first bytes of the
|
||||
// stream, '<?'
|
||||
// stream, '<?xm'
|
||||
private static String getXMLGuessEncoding(final BufferedInputStream is) throws IOException {
|
||||
String encoding = null;
|
||||
final int[] bytes = new int[4];
|
||||
|
@ -638,12 +639,14 @@ public class XmlReader extends Reader {
|
|||
encoding = UTF_16LE;
|
||||
} else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
|
||||
encoding = UTF_8;
|
||||
} else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94) {
|
||||
encoding = CP1047;
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
|
||||
// returns the encoding declared in the <?xml encoding=...?>, NULL if none
|
||||
private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc) throws IOException {
|
||||
static String getXmlProlog(final InputStream is, final String guessedEnc) throws IOException {
|
||||
String encoding = null;
|
||||
if (guessedEnc != null) {
|
||||
final byte[] bytes = new byte[BUFFER_SIZE];
|
||||
|
@ -656,7 +659,7 @@ public class XmlReader extends Reader {
|
|||
offset += c;
|
||||
max -= c;
|
||||
c = is.read(bytes, offset, max);
|
||||
firstGT = new String(bytes, 0, offset).indexOf(">");
|
||||
firstGT = new String(bytes, 0, offset, guessedEnc).indexOf(">");
|
||||
}
|
||||
if (firstGT == -1) {
|
||||
if (c == -1) {
|
||||
|
@ -668,14 +671,7 @@ public class XmlReader extends Reader {
|
|||
final int bytesRead = offset;
|
||||
if (bytesRead > 0) {
|
||||
is.reset();
|
||||
final Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes, 0, firstGT + 1), guessedEnc);
|
||||
final BufferedReader bReader = new BufferedReader(reader);
|
||||
final StringBuffer prolog = new StringBuffer();
|
||||
String line = bReader.readLine();
|
||||
while (line != null) {
|
||||
prolog.append(line);
|
||||
line = bReader.readLine();
|
||||
}
|
||||
String prolog = new String(bytes, guessedEnc).substring(0, firstGT);
|
||||
final Matcher m = ENCODING_PATTERN.matcher(prolog);
|
||||
if (m.find()) {
|
||||
encoding = m.group(1).toUpperCase(Locale.ENGLISH);
|
||||
|
|
|
@ -14,7 +14,17 @@
|
|||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
package com.rometools.rome.unittest;
|
||||
|
||||
package com.rometools.rome.io;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
|
@ -22,27 +32,27 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.MessageFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import com.rometools.rome.io.XmlReader;
|
||||
|
||||
/**
|
||||
* @author pat, tucu
|
||||
*
|
||||
*/
|
||||
public class TestXmlReader extends TestCase {
|
||||
public class XmlReaderTest {
|
||||
|
||||
private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
|
||||
private static final String XML4 = "xml-prolog-encoding-single-quotes";
|
||||
private static final String XML3 = "xml-prolog-encoding-double-quotes";
|
||||
private static final String XML2 = "xml-prolog";
|
||||
private static final String XML1 = "xml";
|
||||
|
||||
@Rule
|
||||
public ExpectedException expectedException = ExpectedException.none();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final TestXmlReader test = new TestXmlReader();
|
||||
final XmlReaderTest test = new XmlReaderTest();
|
||||
test.testRawBom();
|
||||
test.testRawNoBom();
|
||||
test.testHttp();
|
||||
|
@ -92,10 +102,12 @@ public class TestXmlReader extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRawNoBom() throws Exception {
|
||||
testRawNoBomValid("US-ASCII");
|
||||
testRawNoBomValid("UTF-8");
|
||||
testRawNoBomValid("ISO-8859-1");
|
||||
testRawNoBomValid("CP1047");
|
||||
}
|
||||
|
||||
protected void testRawBomValid(final String encoding) throws Exception {
|
||||
|
@ -120,6 +132,7 @@ public class TestXmlReader extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRawBom() throws Exception {
|
||||
testRawBomValid("UTF-8");
|
||||
testRawBomValid("UTF-16BE");
|
||||
|
@ -136,6 +149,7 @@ public class TestXmlReader extends TestCase {
|
|||
testRawBomInvalid("UTF-16LE-bom", "UTF-16LE", "UTF-8");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHttp() throws Exception {
|
||||
testHttpValid("application/xml", "no-bom", "US-ASCII", null);
|
||||
testHttpValid("application/xml", "UTF-8-bom", "US-ASCII", null);
|
||||
|
@ -271,6 +285,7 @@ public class TestXmlReader extends TestCase {
|
|||
private static final String ENCODING_ATTRIBUTE_XML = "<?xml version=\"1.0\" ?> \n" + "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n" + "\n"
|
||||
+ " <atom:entry>\n" + " <atom:title encoding=\"base64\"><![CDATA\n" + "aW5nTGluZSIgLz4";
|
||||
|
||||
@Test
|
||||
public void testEncodingAttributeXML() throws Exception {
|
||||
final InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML.getBytes());
|
||||
final XmlReader xmlReader = new XmlReader(is, "", true);
|
||||
|
@ -348,4 +363,120 @@ public class TestXmlReader extends TestCase {
|
|||
return new ByteArrayInputStream(baos.toByteArray());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
|
||||
assertEquals("TEST", prologEncoding);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_Utf8() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
final String prologEncoding = XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
|
||||
assertEquals("UTF-8", prologEncoding);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_Utf16() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"UTF-16\"?>", "UTF-16");
|
||||
final String guessedEncoding = "UTF-16";
|
||||
|
||||
assertEquals("UTF-16", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_Cp1047() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"CP1047\"?>", "CP1047");
|
||||
final String guessedEncoding = "CP1047";
|
||||
|
||||
assertEquals("CP1047", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_NoEncoding() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml>", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
assertNull(XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_GuessedIsNull() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"UTF-8\"?>", "UTF-8");
|
||||
final String guessedEncoding = null;
|
||||
|
||||
assertNull(XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_UppercaseResult() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"utf-8\"?>", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
assertEquals("UTF-8", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_DifferentAsciiCompatible() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "ISO-8859-1");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
assertEquals("TEST", XmlReader.getXmlProlog(input, guessedEncoding));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_DifferentAsciiIncompatible() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?>", "UTF-16BE");
|
||||
final String guessedEncoding = "UTF-16LE";
|
||||
|
||||
expectedException.expect(IOException.class);
|
||||
|
||||
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_NoClosingAngleBracket() throws IOException {
|
||||
final InputStream input = stringToStream("<?xml encoding=\"TEST\"", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
expectedException.expect(IOException.class);
|
||||
|
||||
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_Empty() throws IOException {
|
||||
final InputStream input = stringToStream("", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
expectedException.expect(IOException.class);
|
||||
|
||||
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetXmlProlog_ClosingAngleBracketIsTooFar() throws IOException {
|
||||
final StringBuilder spaces = new StringBuilder();
|
||||
for (int i = 0; i < 5000; i++) {
|
||||
spaces.append(" ");
|
||||
}
|
||||
|
||||
final InputStream input = stringToStream("<?xml encoding=\"TEST\"?" + spaces + ">", "UTF-8");
|
||||
final String guessedEncoding = "UTF-8";
|
||||
|
||||
expectedException.expect(IOException.class);
|
||||
|
||||
XmlReader.getXmlProlog(input, guessedEncoding);
|
||||
}
|
||||
|
||||
static InputStream stringToStream(String string, String encoding) {
|
||||
return new ByteArrayInputStream(string.getBytes(Charset.forName(encoding)));
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue