1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import java.net.HttpURLConnection;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25 import java.text.MessageFormat;
26
27 /***
 * Character stream that handles (or at least attempts to) all the necessary Voodoo to figure out
 * the charset encoding of the XML document within the stream.
30 * <p>
31 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32 * character stream.
33 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser
 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
 * (the UTF-8 BOM is not handled by all parsers right now; XmlReader handles it and things work in
 * all parsers).
38 * <p>
39 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40 * HTTP streams by offering a wide set of constructors.
41 * <P>
 * By default the charset encoding detection is lenient; the constructors with the lenient flag
 * can be used for a strict detection (following the HTTP MIME and XML specifications).
44 * All this is nicely explained by Mark Pilgrim in his blog,
45 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
46 * Determining the character encoding of a feed</a>.
47 * <p>
48 * @author Alejandro Abdelnur
49 *
50 */
51 public class XmlReader extends Reader {
52 private static final int PUSHBACK_MAX_SIZE = 1024;
53
54 private static final String UTF_8 = "UTF-8";
55 private static final String US_ASCII = "US-ASCII";
56 private static final String UTF_16BE = "UTF-16BE";
57 private static final String UTF_16LE = "UTF-16LE";
58 private static final String UTF_16 = "UTF-16";
59
60 private Reader _reader;
61 private String _encoding;
62
63 /***
64 * Creates a Reader for a File.
65 * <p>
66 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
67 * missing defaults to UTF-8.
68 * <p>
69 * It does a lenient charset encoding detection, check the constructor with the lenient parameter
70 * for details.
71 * <p>
72 * @param file File to create a Reader from.
73 * @throws IOException thrown if there is a problem reading the file.
74 *
75 */
76 public XmlReader(File file) throws IOException {
77 this(new FileInputStream(file));
78 }
79
/**
 * Creates a Reader for a raw InputStream, using lenient charset encoding detection.
 * <p>
 * Equivalent to calling the (InputStream, boolean) constructor with the lenient flag set
 * to <code>true</code>; see that constructor for the details of the detection.
 * <p>
 * @param is InputStream to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream.
 *
 */
public XmlReader(InputStream is) throws IOException {
    this(is, true);
}
95
/**
 * Creates a Reader for a raw InputStream.
 * <p>
 * The stream is inspected for a BOM, then for the byte pattern of an XML declaration, and
 * then for the charset named in the XML prolog; the charset encoding is derived from these.
 * <p>
 * If that detection fails with an XmlReaderException and lenient detection is indicated,
 * the following fallback is applied (no content-type is available for a raw stream):
 * <p>
 * If the XML prolog had a charset encoding that encoding is used.
 * <p>
 * Else if the exception carries a content-type charset encoding that encoding is used.
 * <p>
 * Else 'UTF-8' is used.
 * <p>
 * If lenient detection is not indicated the XmlReaderException is rethrown.
 * <p>
 * @param is InputStream to create a Reader from.
 * @param lenient indicates if the charset encoding detection should be relaxed.
 * @throws IOException thrown if there is a problem reading the stream.
 * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
 *
 */
public XmlReader(InputStream is, boolean lenient) throws IOException, XmlReaderException {
    try {
        doRawStream(is);
    } catch (XmlReaderException ex) {
        if (!lenient) {
            throw ex;
        }
        doLenientDetection(null, ex);
    }
}
133
/**
 * Creates a Reader using the InputStream of a URL.
 * <p>
 * A connection to the URL is opened and handed to the URLConnection constructor, which
 * decides between content-type based detection and raw stream detection.
 * <p>
 * Detection is lenient; see the constructors with the lenient parameter for details.
 * <p>
 * @param url URL to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream of the URL.
 *
 */
public XmlReader(URL url) throws IOException {
    this(url.openConnection());
}
153
/**
 * Creates a Reader using the InputStream of a URLConnection.
 * <p>
 * If the connection is an HttpURLConnection, or it reports a content-type, the detection
 * used for an InputStream with content-type is applied. Otherwise the stream is treated
 * as a raw stream, the same way files are.
 * <p>
 * Detection is always lenient here: an XmlReaderException raised by the detection is
 * handed to the lenient fallback instead of being propagated.
 * <p>
 * @param conn URLConnection to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
 *
 */
public XmlReader(URLConnection conn) throws IOException {
    // HTTP connections always go through content-type detection, even with a null header.
    boolean useContentType = (conn instanceof HttpURLConnection) || conn.getContentType() != null;
    try {
        if (useContentType) {
            doHttpStream(conn.getInputStream(), conn.getContentType());
        } else {
            doRawStream(conn.getInputStream());
        }
    } catch (XmlReaderException ex) {
        doLenientDetection(useContentType ? conn.getContentType() : null, ex);
    }
}
197
/**
 * Creates a Reader using an InputStream and the associated content-type header, with
 * lenient charset encoding detection.
 * <p>
 * Equivalent to calling the (InputStream, String, boolean) constructor with the lenient
 * flag set to <code>true</code>; see that constructor for the details of the detection.
 * <p>
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @throws IOException thrown if there is a problem reading the stream.
 *
 */
public XmlReader(InputStream is, String httpContentType) throws IOException {
    this(is, httpContentType, true);
}
216
/**
 * Creates a Reader using an InputStream and the associated content-type header.
 * <p>
 * The charset encoding is resolved from the MIME type and charset of the content-type
 * header together with the BOM, the XML declaration byte pattern and the XML prolog
 * encoding sniffed from the stream.
 * <p>
 * If that detection fails with an XmlReaderException and lenient detection is indicated,
 * the following is attempted:
 * <p>
 * If the content type starts with 'text/html' it is replaced with 'text/xml' and the
 * detection is tried again.
 * <p>
 * Else (or if that retry fails) if the XML prolog had a charset encoding that encoding is used.
 * <p>
 * Else if the content type had a charset encoding that encoding is used.
 * <p>
 * Else 'UTF-8' is used.
 * <p>
 * If lenient detection is not indicated the XmlReaderException is rethrown.
 * <p>
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @param lenient indicates if the charset encoding detection should be relaxed.
 * @throws IOException thrown if there is a problem reading the stream.
 * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
 *
 */
public XmlReader(InputStream is, String httpContentType, boolean lenient) throws IOException, XmlReaderException {
    try {
        doHttpStream(is, httpContentType);
    } catch (XmlReaderException ex) {
        if (lenient) {
            doLenientDetection(httpContentType, ex);
            return;
        }
        throw ex;
    }
}
258
259 private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
260 if (httpContentType!=null) {
261 if (httpContentType.startsWith("text/html")) {
262 httpContentType = httpContentType.substring("text/html".length());
263 httpContentType = "text/xml" + httpContentType;
264 try {
265 doHttpStream(ex.getInputStream(),httpContentType);
266 ex = null;
267 }
268 catch (XmlReaderException ex2) {
269 ex = ex2;
270 }
271 }
272 }
273 if (ex!=null) {
274 String encoding = ex.getXmlEncoding();
275 if (encoding==null) {
276 encoding = ex.getContentTypeEncoding();
277 }
278 if (encoding==null) {
279 encoding = UTF_8;
280 }
281 prepareReader(ex.getInputStream(),encoding);
282 }
283 }
284
285 /***
286 * Returns the charset encoding of the XmlReader.
287 * <p>
288 * @return charset encoding.
289 *
290 */
291 public String getEncoding() {
292 return _encoding;
293 }
294
295 public int read(char[] buf,int offset,int len) throws IOException {
296 return _reader.read(buf,offset,len);
297 }
298
299 /***
300 * Closes the XmlReader stream.
301 * <p>
302 * @throws IOException thrown if there was a problem closing the stream.
303 *
304 */
305 public void close() throws IOException {
306 _reader.close();
307 }
308
309 private void doRawStream(InputStream is) throws IOException {
310 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
311 String bomEnc = getBOMEncoding(pis);
312 String xmlGuessEnc = getXMLGuessEncoding(pis);
313 String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
314 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
315 prepareReader(pis,encoding);
316 }
317
318 private void doHttpStream(InputStream is,String httpContentType) throws IOException {
319 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
320 String cTMime = getContentTypeMime(httpContentType);
321 String cTEnc = getContentTypeEncoding(httpContentType);
322 String bomEnc = getBOMEncoding(pis);
323 String xmlGuessEnc = getXMLGuessEncoding(pis);
324 String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
325 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis);
326 prepareReader(pis,encoding);
327 }
328
329 private void prepareReader(InputStream is,String encoding) throws IOException {
330 _reader = new InputStreamReader(is,encoding);
331 _encoding = encoding;
332 }
333
334
335 private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
336 String encoding;
337 if (bomEnc==null) {
338 if (xmlGuessEnc==null || xmlEnc==null) {
339 encoding = UTF_8;
340 }
341 else
342 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
343 encoding = xmlGuessEnc;
344 }
345 else {
346 encoding = xmlEnc;
347 }
348 }
349 else
350 if (bomEnc.equals(UTF_8)) {
351 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
352 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
353 bomEnc,xmlGuessEnc,xmlEnc,is);
354 }
355 if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
356 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
357 bomEnc,xmlGuessEnc,xmlEnc,is);
358 }
359 encoding = UTF_8;
360 }
361 else
362 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
363 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
364 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
365 }
366 if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
367 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
368 bomEnc,xmlGuessEnc,xmlEnc,is);
369 }
370 encoding =bomEnc;
371 }
372 else {
373 throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
374 bomEnc,xmlGuessEnc,xmlEnc,is);
375 }
376 return encoding;
377 }
378
379
380 private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
381 boolean appXml = isAppXml(cTMime);
382 boolean textXml = isTextXml(cTMime);
383 String encoding;
384 if (appXml || textXml) {
385 if (cTEnc==null) {
386 if (appXml) {
387 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
388 }
389 else {
390 encoding = US_ASCII;
391 }
392 }
393 else
394 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
395 throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
396 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
397 }
398 else
399 if (cTEnc.equals(UTF_16)) {
400 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
401 encoding = bomEnc;
402 }
403 else {
404 throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
405 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
406 }
407 }
408 else {
409 encoding = cTEnc;
410 }
411 }
412 else {
413 throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
414 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
415 }
416 return encoding;
417 }
418
419
420 private static String getContentTypeMime(String httpContentType) {
421 String mime = null;
422 if (httpContentType!=null) {
423 int i = httpContentType.indexOf(";");
424 mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
425 }
426 return mime;
427 }
428
429 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
430
431
432 private static String getContentTypeEncoding(String httpContentType) {
433 String encoding = null;
434 if (httpContentType!=null) {
435 int i = httpContentType.indexOf(";");
436 if (i>-1) {
437 String postMime = httpContentType.substring(i+1);
438 Matcher m = CHARSET_PATTERN.matcher(postMime);
439 encoding = (m.find()) ? m.group(1) : null;
440 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
441 }
442 }
443 return encoding;
444 }
445
446
447
448 private static String getBOMEncoding(PushbackInputStream is) throws IOException {
449 String encoding = null;
450 int[] bytes = new int[3];
451 bytes[0] = is.read();
452 bytes[1] = is.read();
453 bytes[2] = is.read();
454
455 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
456 encoding = UTF_16BE;
457 is.unread(bytes[2]);
458 }
459 else
460 if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
461 encoding = UTF_16LE;
462 is.unread(bytes[2]);
463 }
464 else
465 if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
466 encoding = UTF_8;
467 }
468 else {
469 for (int i=bytes.length-1;i>=0;i--) {
470 is.unread(bytes[i]);
471 }
472 }
473 return encoding;
474 }
475
476
477 private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
478 String encoding = null;
479 int[] bytes = new int[4];
480 bytes[0] = is.read();
481 bytes[1] = is.read();
482 bytes[2] = is.read();
483 bytes[3] = is.read();
484 for (int i=bytes.length-1;i>=0;i--) {
485 is.unread(bytes[i]);
486 }
487
488 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
489 encoding = UTF_16BE;
490 }
491 else
492 if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
493 encoding = UTF_16LE;
494 }
495 else
496 if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
497 encoding = UTF_8;
498 }
499 return encoding;
500 }
501
502 private static final Pattern ENCODING_PATTERN = Pattern.compile("^<//?xml.*encoding=\"(.*)\".*//?>");
503
504
505 private static String getXMLPrologEncoding(PushbackInputStream is,String guessedEnc) throws IOException {
506 String encoding = null;
507 if (guessedEnc!=null) {
508 byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
509 int offset = 0;
510 int max = PUSHBACK_MAX_SIZE;
511 int c = is.read(bytes,offset,max);
512 while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
513 offset += c;
514 max -= c;
515 c = is.read(bytes,offset,max);
516 }
517 int bytesRead = offset;
518 if (bytesRead>0) {
519 is.unread(bytes,0,bytesRead);
520 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
521 BufferedReader br = new BufferedReader(reader);
522 String prolog = br.readLine();
523 Matcher m = ENCODING_PATTERN.matcher(prolog);
524 encoding = (m.find()) ? m.group(1).toUpperCase() : null;
525 }
526 }
527 return encoding;
528 }
529
530
531 private static boolean isAppXml(String mime) {
532 return mime!=null &&
533 (mime.equals("application/xml") ||
534 mime.equals("application/xml-dtd") ||
535 mime.equals("application/xml-external-parsed-entity") ||
536 (mime.startsWith("application/") && mime.endsWith("+xml")));
537 }
538
539
540 private static boolean isTextXml(String mime) {
541 return mime!=null &&
542 (mime.equals("text/xml") ||
543 mime.equals("text/xml-external-parsed-entity") ||
544 (mime.startsWith("text/") && mime.endsWith("+xml")));
545 }
546
547 private static final MessageFormat RAW_EX_1 = new MessageFormat(
548 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
549
550 private static final MessageFormat RAW_EX_2 = new MessageFormat(
551 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
552
553 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
554 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
555
556 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
557 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
558
559 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
560 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
561
562 }