1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import java.net.HttpURLConnection;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25 import java.text.MessageFormat;
26
27 /***
28 * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out
29 * the charset encoding of the XML document within the stream.
30 * <p>
31 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32 * character stream.
33 * <p>
34 * All this has to be done without consuming characters from the stream; otherwise the XML parser
35 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
36 * (a UTF-8 BOM is not handled by all parsers right now; XmlReader handles it, so things work in all
37 * parsers).
38 * <p>
39 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40 * HTTP streams by offering a wide set of constructors.
41 * <P>
42 * There are also some convenience static methods to find out charset encodings following the
43 * rules defined by HTTP, MIME types and XML specifications. All this is nicely explained by
44 * Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
45 * Determining the character encoding of a feed</a>.
46 * <p>
47 * @author Alejandro Abdelnur
48 *
49 */
public class XmlReader extends Reader {
    /** Capacity of the pushback buffer; also the maximum number of bytes sniffed for the XML declaration. */
    private static final int PUSHBACK_MAX_SIZE = 1024;

    private static final String UTF_8 = "UTF-8";
    private static final String US_ASCII = "US-ASCII";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    /**
     * Extracts the value of the 'charset' parameter of a content-type header. The parameter name
     * is matched case-insensitively and an optional surrounding quote is skipped, so that
     * <code>charset="utf-8"</code> yields <code>utf-8</code>.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
        "\\bcharset\\s*=\\s*[\"']?([^;\\s\"']*)", Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the encoding pseudo-attribute of an XML declaration located at the very beginning
     * of the text. The value may be enclosed in double quotes (group 1) or single quotes (group 2)
     * and the declaration may span several lines.
     */
    private static final Pattern ENCODING_PATTERN = Pattern.compile(
        "^<\\?xml(?=\\s)[^>]*?\\sencoding\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')");

    // Message patterns are kept as plain Strings and formatted through the static
    // MessageFormat.format(), because MessageFormat instances are not synchronized and
    // must not be shared between threads.
    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    private static final String HTTP_EX_1 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    private static final String HTTP_EX_2 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";

    private Reader _reader;
    private String _encoding;

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for a BOM first, if there is none it sniffs the XML prolog charset, if this is
     * also missing it defaults to UTF-8.
     * <p>
     * If the encoding cannot be determined the file stream opened by this constructor is closed
     * before the exception is propagated.
     * <p>
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     *
     */
    public XmlReader(File file) throws IOException {
        InputStream is = new FileInputStream(file);
        boolean initialized = false;
        try {
            doRawStream(is);
            initialized = true;
        }
        finally {
            if (!initialized) {
                // The caller never sees this stream, so nobody else could close it.
                closeQuietly(is);
            }
        }
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files. The given stream is owned by the caller, it is
     * not closed if this constructor fails.
     * <p>
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     *
     */
    public XmlReader(InputStream is) throws IOException {
        doRawStream(is);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
     * data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URL.
     *
     */
    public XmlReader(URL url) throws IOException {
        this(url.openConnection());
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is not
     * 'content-type' header in the fetched data it uses the same logic used for files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * If the encoding cannot be determined the connection's stream obtained by this constructor
     * is closed before the exception is propagated.
     * <p>
     * @param conn URLConnection to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
     *
     */
    public XmlReader(URLConnection conn) throws IOException {
        String contentType = conn.getContentType();
        InputStream is = conn.getInputStream();
        boolean initialized = false;
        try {
            if (conn instanceof HttpURLConnection || contentType != null) {
                doHttpStream(is, contentType);
            }
            else {
                doRawStream(is);
            }
            initialized = true;
        }
        finally {
            if (!initialized) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * The MIME type must be an XML one (application/xml, text/xml, their DTD/external entity
     * variants or any application/*+xml, text/*+xml), matched case-insensitively. If the
     * content-type carries a charset it is used (UTF-16 requires a UTF-16 BOM, UTF-16BE/LE
     * forbid a BOM). Without charset, application XML types use the raw stream logic (BOM,
     * XML prolog, UTF-8 default) and text XML types use US-ASCII.
     * <p>
     * The given stream is owned by the caller, it is not closed if this constructor fails.
     * <p>
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws IOException thrown if there is a problem reading the stream or the encoding
     *         information is inconsistent.
     *
     */
    public XmlReader(InputStream is, String httpContentType) throws IOException {
        doHttpStream(is, httpContentType);
    }

    /**
     * Returns the charset encoding of the XmlReader.
     * <p>
     * @return charset encoding.
     *
     */
    public String getEncoding() {
        return _encoding;
    }

    /**
     * Reads characters decoded with the detected encoding, delegating to the wrapped reader.
     * <p>
     * @param buf destination buffer.
     * @param offset offset in the buffer at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return number of characters read, or -1 at the end of the stream.
     * @throws IOException thrown if there is a problem reading the stream.
     *
     */
    public int read(char[] buf, int offset, int len) throws IOException {
        return _reader.read(buf, offset, len);
    }

    /**
     * Closes the XmlReader stream.
     * <p>
     * @throws IOException thrown if there was a problem closing the stream.
     *
     */
    public void close() throws IOException {
        _reader.close();
    }

    /** Detects the encoding of a stream without content-type information and sets up the reader. */
    private void doRawStream(InputStream is) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXMLPrologEncoding(pis, xmlGuessEnc);
        _encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
        _reader = new InputStreamReader(pis, _encoding);
    }

    /** Detects the encoding of a stream that comes with a content-type header and sets up the reader. */
    private void doHttpStream(InputStream is, String httpContentType) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String cTMime = getContentTypeMime(httpContentType);
        String cTEnc = getContentTypeEncoding(httpContentType);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXMLPrologEncoding(pis, xmlGuessEnc);
        _encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        _reader = new InputStreamReader(pis, _encoding);
    }

    /**
     * Resolves the encoding from the BOM, the encoding guessed from the first bytes and the
     * encoding declared in the XML prolog. Any of them may be null.
     *
     * @throws IOException if the three sources contradict each other or the BOM is unknown.
     */
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc) throws IOException {
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return UTF_8;
            }
            // A generic "UTF-16" declaration is refined with the byte order seen in the stream.
            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                return xmlGuessEnc;
            }
            return xmlEnc;
        }
        if (bomEnc.equals(UTF_8)) {
            boolean guessMismatch = xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8);
            boolean prologMismatch = xmlEnc != null && !xmlEnc.equals(UTF_8);
            if (guessMismatch || prologMismatch) {
                throw new IOException(message(RAW_EX_1, new Object[]{bomEnc, xmlGuessEnc, xmlEnc}));
            }
            return UTF_8;
        }
        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                throw new IOException(message(RAW_EX_1, new Object[]{bomEnc, xmlGuessEnc, xmlEnc}));
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw new IOException(message(RAW_EX_1, new Object[]{bomEnc, xmlGuessEnc, xmlEnc}));
            }
            return bomEnc;
        }
        throw new IOException(message(RAW_EX_2, new Object[]{bomEnc, xmlGuessEnc, xmlEnc}));
    }

    /**
     * Resolves the encoding when a content-type header is available, combining its MIME type and
     * charset with the information sniffed from the stream.
     *
     * @throws IOException if the MIME type is not an XML one or the encoding information is
     *         inconsistent.
     */
    private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc) throws IOException {
        boolean appXml = isAppXml(cTMime);
        boolean textXml = isTextXml(cTMime);
        if (!appXml && !textXml) {
            throw new IOException(message(HTTP_EX_3, new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}));
        }
        if (cTEnc == null) {
            // No charset parameter: application types fall back to the document itself,
            // text types default to US-ASCII.
            return appXml ? calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc) : US_ASCII;
        }
        if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
            // An explicit byte order in the header must not be combined with a BOM.
            throw new IOException(message(HTTP_EX_1, new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}));
        }
        if (cTEnc.equals(UTF_16)) {
            // A generic UTF-16 header needs the BOM to tell the byte order.
            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                return bomEnc;
            }
            throw new IOException(message(HTTP_EX_2, new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}));
        }
        return cTEnc;
    }

    /** Returns the MIME type of a content-type header (the part before ';', trimmed), or null if the header is null. */
    private static String getContentTypeMime(String httpContentType) {
        String mime = null;
        if (httpContentType != null) {
            int i = httpContentType.indexOf(";");
            mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
        }
        return mime;
    }

    /**
     * Returns the upper-cased charset parameter of a content-type header, or null if the header
     * is null, has no parameters, has no charset parameter or the charset value is empty.
     */
    private static String getContentTypeEncoding(String httpContentType) {
        if (httpContentType == null) {
            return null;
        }
        int i = httpContentType.indexOf(";");
        if (i < 0) {
            return null;
        }
        Matcher m = CHARSET_PATTERN.matcher(httpContentType.substring(i + 1));
        if (!m.find()) {
            return null;
        }
        String value = m.group(1);
        // Locale-independent upper-casing: charset names must not depend on the default locale.
        return (value.length() == 0) ? null : value.toUpperCase(java.util.Locale.ENGLISH);
    }

    /**
     * Returns the encoding indicated by the BOM at the current position of the stream, or null
     * if there is no BOM. A BOM is consumed; any other bytes read are pushed back, so a stream
     * without BOM is left untouched.
     */
    private static String getBOMEncoding(PushbackInputStream is) throws IOException {
        byte[] head = new byte[3];
        int count = readUpTo(is, head);
        int b0 = (count > 0) ? (head[0] & 0xFF) : -1;
        int b1 = (count > 1) ? (head[1] & 0xFF) : -1;
        int b2 = (count > 2) ? (head[2] & 0xFF) : -1;

        String encoding = null;
        int bomLength = 0;
        if (b0 == 0xFE && b1 == 0xFF) {
            encoding = UTF_16BE;
            bomLength = 2;
        }
        else
        if (b0 == 0xFF && b1 == 0xFE) {
            encoding = UTF_16LE;
            bomLength = 2;
        }
        else
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            encoding = UTF_8;
            bomLength = 3;
        }
        // Push back only bytes that were really read; pushing back the EOF marker (-1)
        // would inject a spurious 0xFF byte into the stream.
        if (count > bomLength) {
            is.unread(head, bomLength, count - bomLength);
        }
        return encoding;
    }

    /**
     * Guesses the encoding family from the first four bytes of the stream, which are expected to
     * be the beginning of '&lt;?xml' (UTF-16 variants are told apart by the position of the zero
     * bytes). The stream is left untouched. Returns null if the bytes do not match.
     */
    private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
        byte[] head = new byte[4];
        int count = readUpTo(is, head);
        if (count > 0) {
            is.unread(head, 0, count);
        }
        if (count < head.length) {
            return null;
        }

        String encoding = null;
        if (head[0] == 0x00 && head[1] == 0x3C && head[2] == 0x00 && head[3] == 0x3F) {
            encoding = UTF_16BE;
        }
        else
        if (head[0] == 0x3C && head[1] == 0x00 && head[2] == 0x3F && head[3] == 0x00) {
            encoding = UTF_16LE;
        }
        else
        if (head[0] == 0x3C && head[1] == 0x3F && head[2] == 0x78 && head[3] == 0x6D) {
            encoding = UTF_8;
        }
        return encoding;
    }

    /**
     * Returns the upper-cased encoding declared in the XML declaration at the beginning of the
     * stream, decoding the bytes with the guessed encoding. Returns null if there is no guessed
     * encoding, no XML declaration or no (non-empty) encoding in it. The stream is left untouched.
     */
    private static String getXMLPrologEncoding(PushbackInputStream is, String guessedEnc) throws IOException {
        if (guessedEnc == null) {
            return null;
        }
        byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
        int total = 0;
        boolean declarationEnd = false;
        // A single read() may return only part of the declaration (e.g. network streams), so
        // keep reading until the closing '>' shows up, the buffer is full or the stream ends.
        while (!declarationEnd && total < bytes.length) {
            int n = is.read(bytes, total, bytes.length - total);
            if (n <= 0) {
                break;
            }
            for (int i = total; i < total + n && !declarationEnd; i++) {
                declarationEnd = (bytes[i] == 0x3E);
            }
            total += n;
        }
        if (total == 0) {
            return null;
        }
        is.unread(bytes, 0, total);

        String head = new String(bytes, 0, total, guessedEnc);
        Matcher m = ENCODING_PATTERN.matcher(head);
        if (!m.find()) {
            return null;
        }
        String value = (m.group(1) != null) ? m.group(1) : m.group(2);
        value = value.trim();
        return (value.length() == 0) ? null : value.toUpperCase(java.util.Locale.ENGLISH);
    }

    /** Indicates if the MIME type is an application XML type; the comparison is case-insensitive. */
    private static boolean isAppXml(String mime) {
        if (mime == null) {
            return false;
        }
        String m = mime.toLowerCase(java.util.Locale.ENGLISH);
        return m.equals("application/xml") ||
               m.equals("application/xml-dtd") ||
               m.equals("application/xml-external-parsed-entity") ||
               (m.startsWith("application/") && m.endsWith("+xml"));
    }

    /** Indicates if the MIME type is a text XML type; the comparison is case-insensitive. */
    private static boolean isTextXml(String mime) {
        if (mime == null) {
            return false;
        }
        String m = mime.toLowerCase(java.util.Locale.ENGLISH);
        return m.equals("text/xml") ||
               m.equals("text/xml-external-parsed-entity") ||
               (m.startsWith("text/") && m.endsWith("+xml"));
    }

    /**
     * Reads bytes until the buffer is full or the end of the stream is reached.
     *
     * @return number of bytes stored in the buffer, 0 if the stream is already at its end.
     */
    private static int readUpTo(InputStream is, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = is.read(buffer, total, buffer.length - total);
            if (n <= 0) {
                break;
            }
            total += n;
        }
        return total;
    }

    /** Formats an error message with a fresh formatter on every call. */
    private static String message(String pattern, Object[] args) {
        return MessageFormat.format(pattern, args);
    }

    /** Closes a stream while another exception is already propagating; a failure to close is ignored on purpose. */
    private static void closeQuietly(InputStream is) {
        try {
            is.close();
        }
        catch (IOException ignored) {
            // The original failure is the one the caller needs to see.
        }
    }

}