1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.net.URLConnection;
22 import java.net.HttpURLConnection;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25 import java.text.MessageFormat;
26
27 /***
28 * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out
29 * the charset encoding of the XML document within the stream.
30 * <p>
31 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
32 * character stream.
33 * <p>
34 * All this has to be done without consuming characters from the stream; otherwise the XML parser
35 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
36 * (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all
37 * parsers).
38 * <p>
39 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
40 * HTTP streams by offering a wide set of constructors.
41 * <p>
42 * By default the charset encoding detection is lenient; the constructors with the lenient flag
43 * can be used for strict detection (following the HTTP MIME and XML specifications).
44 * All this is nicely explained by Mark Pilgrim in his blog,
45 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
46 * Determining the character encoding of a feed</a>.
47 * <p>
48 * @author Alejandro Abdelnur
49 *
50 */
51 public class XmlReader extends Reader {
52 private static final int PUSHBACK_MAX_SIZE = 4096;
53
54 private static final String UTF_8 = "UTF-8";
55 private static final String US_ASCII = "US-ASCII";
56 private static final String UTF_16BE = "UTF-16BE";
57 private static final String UTF_16LE = "UTF-16LE";
58 private static final String UTF_16 = "UTF-16";
59
60 private Reader _reader;
61 private String _encoding;
62
63 /***
64 * Creates a Reader for a File.
65 * <p>
66 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
67 * missing defaults to UTF-8.
68 * <p>
69 * It does a lenient charset encoding detection, check the constructor with the lenient parameter
70 * for details.
71 * <p>
72 * @param file File to create a Reader from.
73 * @throws IOException thrown if there is a problem reading the file.
74 *
75 */
76 public XmlReader(File file) throws IOException {
77 this(new FileInputStream(file));
78 }
79
/**
 * Creates a Reader for a raw InputStream using lenient charset detection.
 * <p>
 * This is a shorthand for the two-argument constructor with the lenient flag
 * set to <code>true</code>; see that constructor for the detection rules.
 * <p>
 * @param is InputStream to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream.
 *
 */
public XmlReader(InputStream is) throws IOException {
    this(is, true);
}
95
/**
 * Creates a Reader for a raw InputStream.
 * <p>
 * The encoding is worked out from the byte order mark and the XML prolog of the
 * stream (no content-type information is available for a raw stream).
 * <p>
 * If that detection fails and lenient detection was requested, the fallback is:
 * <p>
 * the encoding found in the XML prolog, if any;
 * <p>
 * else the content-type encoding recorded in the failure, if any;
 * <p>
 * else 'UTF-8'.
 * <p>
 * With lenient detection the XmlReaderException raised by the detection step is
 * not propagated. Without it, the exception is rethrown unchanged.
 * <p>
 * @param is InputStream to create a Reader from.
 * @param lenient indicates if the charset encoding detection should be relaxed.
 * @throws IOException thrown if there is a problem reading the stream.
 * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
 *
 */
public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
    try {
        doRawStream(is, lenient);
    }
    catch (XmlReaderException detectionFailure) {
        if (lenient) {
            doLenientDetection(null, detectionFailure);
        }
        else {
            throw detectionFailure;
        }
    }
}
133
/**
 * Creates a Reader using the InputStream of a URL.
 * <p>
 * The URL is opened and the resulting connection is handed to the URLConnection
 * constructor, which decides between the raw-stream and the content-type aware
 * detection and always detects leniently.
 * <p>
 * @param url URL to create a Reader from.
 * @throws IOException thrown if there is a problem reading the stream of the URL.
 *
 */
public XmlReader(URL url) throws IOException {
    this(url.openConnection());
}
153
154 /***
155 * Creates a Reader using the InputStream of a URLConnection.
156 * <p>
157 * If the URLConnection is not of type HttpURLConnection and there is not
158 * 'content-type' header in the fetched data it uses the same logic used for files.
159 * <p>
160 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
161 * data it uses the same logic used for an InputStream with content-type.
162 * <p>
163 * It does a lenient charset encoding detection, check the constructor with the lenient parameter
164 * for details.
165 * <p>
166 * @param conn URLConnection to create a Reader from.
167 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
168 *
169 */
170 public XmlReader(URLConnection conn) throws IOException {
171 boolean lenient = true;
172 if (conn instanceof HttpURLConnection) {
173 try {
174 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
175 }
176 catch (XmlReaderException ex) {
177 doLenientDetection(conn.getContentType(),ex);
178 }
179 }
180 else
181 if (conn.getContentType()!=null) {
182 try {
183 doHttpStream(conn.getInputStream(),conn.getContentType(),lenient);
184 }
185 catch (XmlReaderException ex) {
186 doLenientDetection(conn.getContentType(),ex);
187 }
188 }
189 else {
190 try {
191 doRawStream(conn.getInputStream(),lenient);
192 }
193 catch (XmlReaderException ex) {
194 doLenientDetection(null,ex);
195 }
196 }
197 }
198
/**
 * Creates a Reader using an InputStream and the associated content-type header,
 * with lenient charset detection.
 * <p>
 * This is a shorthand for the three-argument constructor with the lenient flag
 * set to <code>true</code>; see that constructor for the detection rules.
 * <p>
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @throws IOException thrown if there is a problem reading the stream.
 *
 */
public XmlReader(InputStream is,String httpContentType) throws IOException {
    this(is, httpContentType, true);
}
217
/**
 * Creates a Reader using an InputStream and the associated content-type header.
 * <p>
 * The encoding is resolved from the content-type MIME type and charset, the byte
 * order mark and the XML prolog of the stream.
 * <p>
 * If that detection fails and lenient detection was requested, the fallback is:
 * <p>
 * if the content type starts with 'text/html' it is replaced with 'text/xml' and
 * the detection is attempted again;
 * <p>
 * else (or if that also fails) the XML prolog encoding is used, if any;
 * <p>
 * else the content-type encoding is used, if any;
 * <p>
 * else 'UTF-8' is used.
 * <p>
 * With lenient detection the XmlReaderException raised by the detection step is
 * not propagated. Without it, the exception is rethrown unchanged.
 * <p>
 * @param is InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @param lenient indicates if the charset encoding detection should be relaxed.
 * @throws IOException thrown if there is a problem reading the stream.
 * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
 *
 */
public XmlReader(InputStream is,String httpContentType,boolean lenient) throws IOException, XmlReaderException {
    try {
        doHttpStream(is, httpContentType, lenient);
    }
    catch (XmlReaderException detectionFailure) {
        if (lenient) {
            doLenientDetection(httpContentType, detectionFailure);
        }
        else {
            throw detectionFailure;
        }
    }
}
259
260 private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
261 if (httpContentType!=null) {
262 if (httpContentType.startsWith("text/html")) {
263 httpContentType = httpContentType.substring("text/html".length());
264 httpContentType = "text/xml" + httpContentType;
265 try {
266 doHttpStream(ex.getInputStream(),httpContentType,true);
267 ex = null;
268 }
269 catch (XmlReaderException ex2) {
270 ex = ex2;
271 }
272 }
273 }
274 if (ex!=null) {
275 String encoding = ex.getXmlEncoding();
276 if (encoding==null) {
277 encoding = ex.getContentTypeEncoding();
278 }
279 if (encoding==null) {
280 encoding = UTF_8;
281 }
282 prepareReader(ex.getInputStream(),encoding);
283 }
284 }
285
286 /***
287 * Returns the charset encoding of the XmlReader.
288 * <p>
289 * @return charset encoding.
290 *
291 */
292 public String getEncoding() {
293 return _encoding;
294 }
295
296 public int read(char[] buf,int offset,int len) throws IOException {
297 return _reader.read(buf,offset,len);
298 }
299
300 /***
301 * Closes the XmlReader stream.
302 * <p>
303 * @throws IOException thrown if there was a problem closing the stream.
304 *
305 */
306 public void close() throws IOException {
307 _reader.close();
308 }
309
310 private void doRawStream(InputStream is,boolean lenient) throws IOException {
311 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
312 String bomEnc = getBOMEncoding(pis);
313 String xmlGuessEnc = getXMLGuessEncoding(pis);
314 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
315 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
316 prepareReader(pis,encoding);
317 }
318
319 private void doHttpStream(InputStream is,String httpContentType,boolean lenient) throws IOException {
320 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
321 String cTMime = getContentTypeMime(httpContentType);
322 String cTEnc = getContentTypeEncoding(httpContentType);
323 String bomEnc = getBOMEncoding(pis);
324 String xmlGuessEnc = getXMLGuessEncoding(pis);
325 String xmlEnc = getXmlProlog(pis,xmlGuessEnc);
326 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis,lenient);
327 prepareReader(pis,encoding);
328 }
329
330 private void prepareReader(InputStream is,String encoding) throws IOException {
331 _reader = new InputStreamReader(is,encoding);
332 _encoding = encoding;
333 }
334
335
336 private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
337 String encoding;
338 if (bomEnc==null) {
339 if (xmlGuessEnc==null || xmlEnc==null) {
340 encoding = UTF_8;
341 }
342 else
343 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
344 encoding = xmlGuessEnc;
345 }
346 else {
347 encoding = xmlEnc;
348 }
349 }
350 else
351 if (bomEnc.equals(UTF_8)) {
352 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
353 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
354 bomEnc,xmlGuessEnc,xmlEnc,is);
355 }
356 if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
357 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
358 bomEnc,xmlGuessEnc,xmlEnc,is);
359 }
360 encoding = UTF_8;
361 }
362 else
363 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
364 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
365 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
366 }
367 if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
368 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
369 bomEnc,xmlGuessEnc,xmlEnc,is);
370 }
371 encoding =bomEnc;
372 }
373 else {
374 throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
375 bomEnc,xmlGuessEnc,xmlEnc,is);
376 }
377 return encoding;
378 }
379
380
381 private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is,boolean lenient) throws IOException {
382 String encoding;
383 if (lenient & xmlEnc!=null) {
384 encoding = xmlEnc;
385 }
386 else {
387 boolean appXml = isAppXml(cTMime);
388 boolean textXml = isTextXml(cTMime);
389 if (appXml || textXml) {
390 if (cTEnc==null) {
391 if (appXml) {
392 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
393 }
394 else {
395 encoding = US_ASCII;
396 }
397 }
398 else
399 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
400 throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
401 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
402 }
403 else
404 if (cTEnc.equals(UTF_16)) {
405 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
406 encoding = bomEnc;
407 }
408 else {
409 throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
410 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
411 }
412 }
413 else {
414 encoding = cTEnc;
415 }
416 }
417 else {
418 throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
419 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
420 }
421 }
422 return encoding;
423 }
424
425
426 private static String getContentTypeMime(String httpContentType) {
427 String mime = null;
428 if (httpContentType!=null) {
429 int i = httpContentType.indexOf(";");
430 mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
431 }
432 return mime;
433 }
434
435 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
436
437
438 private static String getContentTypeEncoding(String httpContentType) {
439 String encoding = null;
440 if (httpContentType!=null) {
441 int i = httpContentType.indexOf(";");
442 if (i>-1) {
443 String postMime = httpContentType.substring(i+1);
444 Matcher m = CHARSET_PATTERN.matcher(postMime);
445 encoding = (m.find()) ? m.group(1) : null;
446 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
447 }
448 }
449 return encoding;
450 }
451
452
453
454 private static String getBOMEncoding(PushbackInputStream is) throws IOException {
455 String encoding = null;
456 int[] bytes = new int[3];
457 bytes[0] = is.read();
458 bytes[1] = is.read();
459 bytes[2] = is.read();
460
461 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
462 encoding = UTF_16BE;
463 is.unread(bytes[2]);
464 }
465 else
466 if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
467 encoding = UTF_16LE;
468 is.unread(bytes[2]);
469 }
470 else
471 if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
472 encoding = UTF_8;
473 }
474 else {
475 for (int i=bytes.length-1;i>=0;i--) {
476 is.unread(bytes[i]);
477 }
478 }
479 return encoding;
480 }
481
482
483 private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
484 String encoding = null;
485 int[] bytes = new int[4];
486 bytes[0] = is.read();
487 bytes[1] = is.read();
488 bytes[2] = is.read();
489 bytes[3] = is.read();
490 for (int i=bytes.length-1;i>=0;i--) {
491 is.unread(bytes[i]);
492 }
493
494 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
495 encoding = UTF_16BE;
496 }
497 else
498 if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
499 encoding = UTF_16LE;
500 }
501 else
502 if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
503 encoding = UTF_8;
504 }
505 return encoding;
506 }
507
508
509 private static final Pattern ENCODING_PATTERN =
510 Pattern.compile("<//?xml.*encoding[//s]*=[//s]*((?:\".[^\"]*\")|(?:'.[^']*')).*//?>", Pattern.MULTILINE);
511
512
513 private static String getXmlProlog(PushbackInputStream is,String guessedEnc) throws IOException {
514 String encoding = null;
515 if (guessedEnc!=null) {
516 byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
517 int offset = 0;
518 int max = PUSHBACK_MAX_SIZE;
519 int c = is.read(bytes,offset,max);
520 while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
521 offset += c;
522 max -= c;
523 c = is.read(bytes,offset,max);
524 }
525 int bytesRead = offset;
526 if (bytesRead>0) {
527 is.unread(bytes,0,bytesRead);
528 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
529 BufferedReader br = new BufferedReader(reader);
530 StringBuffer prolog = new StringBuffer(PUSHBACK_MAX_SIZE);
531 String line = br.readLine();
532 while (line != null) {
533 prolog.append(line).append("\n");
534 line = br.readLine();
535 }
536 Matcher m = ENCODING_PATTERN.matcher(prolog);
537 if (m.find()) {
538 encoding = m.group(1).toUpperCase();
539 encoding = encoding.substring(1,encoding.length()-1);
540 }
541 }
542 }
543 return encoding;
544 }
545
546
547 private static boolean isAppXml(String mime) {
548 return mime!=null &&
549 (mime.equals("application/xml") ||
550 mime.equals("application/xml-dtd") ||
551 mime.equals("application/xml-external-parsed-entity") ||
552 (mime.startsWith("application/") && mime.endsWith("+xml")));
553 }
554
555
556 private static boolean isTextXml(String mime) {
557 return mime!=null &&
558 (mime.equals("text/xml") ||
559 mime.equals("text/xml-external-parsed-entity") ||
560 (mime.startsWith("text/") && mime.endsWith("+xml")));
561 }
562
563 private static final MessageFormat RAW_EX_1 = new MessageFormat(
564 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
565
566 private static final MessageFormat RAW_EX_2 = new MessageFormat(
567 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
568
569 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
570 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
571
572 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
573 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
574
575 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
576 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
577
578 }