/*
 * Copyright 2004 Sun Microsystems, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.sun.syndication.io;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Character stream that handles (or at least attempts to handle) all the necessary voodoo to
 * figure out the charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
 * character stream.
 * <p>
 * All this has to be done without consuming characters from the stream, otherwise the XML parser
 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
 * (the UTF-8 BOM is not handled by all parsers right now; XmlReader handles it and things work
 * in all parsers).
 * <p>
 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
 * HTTP streams by offering a wide set of constructors.
 * <p>
 * By default the charset encoding detection is lenient; the constructors with the lenient flag
 * can be used for strict detection (following the HTTP MIME and XML specifications).
 * All this is nicely explained by Mark Pilgrim in his blog,
 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * <p>
 * @author Alejandro Abdelnur
 *
 */
51  public class XmlReader extends Reader {
52      private static final int PUSHBACK_MAX_SIZE = 1024;
53  
54      private static final String UTF_8 = "UTF-8";
55      private static final String US_ASCII = "US-ASCII";
56      private static final String UTF_16BE = "UTF-16BE";
57      private static final String UTF_16LE = "UTF-16LE";
58      private static final String UTF_16 = "UTF-16";
59  
60      private Reader _reader;
61      private String _encoding;
62  
63      /***
64       * Creates a Reader for a File.
65       * <p>
66       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
67       * missing defaults to UTF-8.
68       * <p>
69       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
70       * for details.
71       * <p>
72       * @param file File to create a Reader from.
73       * @throws IOException thrown if there is a problem reading the file.
74       *
75       */
76      public XmlReader(File file) throws IOException {
77          this(new FileInputStream(file));
78      }
79  
80      /***
81       * Creates a Reader for a raw InputStream.
82       * <p>
83       * It follows the same logic used for files.
84       * <p>
85       * It does a lenient charset encoding detection, check the constructor with the lenient parameter
86       * for details.
87       * <p>
88       * @param is InputStream to create a Reader from.
89       * @throws IOException thrown if there is a problem reading the stream.
90       *
91       */
92      public XmlReader(InputStream is) throws IOException {
93          this(is,true);
94      }
95  
96      /***
97       * Creates a Reader for a raw InputStream.
98       * <p>
99       * It follows the same logic used for files.
100      * <p>
101      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
102      * the following:
103      * <p>
104      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
105      * <p>
106      * Else if the XML prolog had a charset encoding that encoding is used.
107      * <p>
108      * Else if the content type had a charset encoding that encoding is used.
109      * <p>
110      * Else 'UTF-8' is used.
111      * <p>
112      * If lenient detection is indicated an XmlReaderException is never thrown.
113      * <p>
114      * @param is InputStream to create a Reader from.
115      * @param lenient indicates if the charset encoding detection should be relaxed.
116      * @throws IOException thrown if there is a problem reading the stream.
117      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
118      *
119      */
120     public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException {
121         try {
122             doRawStream(is);
123         }
124         catch (XmlReaderException ex) {
125             if (!lenient) {
126                 throw ex;
127             }
128             else {
129                 doLenientDetection(null,ex);
130             }
131         }
132     }
133 
134     /***
135      * Creates a Reader using the InputStream of a URL.
136      * <p>
137      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched
138      * data it uses the same logic used for Files.
139      * <p>
140      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
141      * data it uses the same logic used for an InputStream with content-type.
142      * <p>
143      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
144      * for details.
145      * <p>
146      * @param url URL to create a Reader from.
147      * @throws IOException thrown if there is a problem reading the stream of the URL.
148      *
149      */
150     public XmlReader(URL url) throws IOException {
151         this(url.openConnection());
152     }
153 
154     /***
155      * Creates a Reader using the InputStream of a URLConnection.
156      * <p>
157      * If the URLConnection is not of type HttpURLConnection and there is not
158      * 'content-type' header in the fetched data it uses the same logic used for files.
159      * <p>
160      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
161      * data it uses the same logic used for an InputStream with content-type.
162      * <p>
163      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
164      * for details.
165      * <p>
166      * @param conn URLConnection to create a Reader from.
167      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
168      *
169      */
170     public XmlReader(URLConnection conn) throws IOException {
171         if (conn instanceof HttpURLConnection) {
172             try {
173                 doHttpStream(conn.getInputStream(),conn.getContentType());
174             }
175             catch (XmlReaderException ex) {
176                 doLenientDetection(conn.getContentType(),ex);
177             }
178         }
179         else
180         if (conn.getContentType()!=null) {
181             try {
182                 doHttpStream(conn.getInputStream(),conn.getContentType());
183             }
184             catch (XmlReaderException ex) {
185                 doLenientDetection(conn.getContentType(),ex);
186             }
187         }
188         else {
189             try {
190                 doRawStream(conn.getInputStream());
191             }
192             catch (XmlReaderException ex) {
193                 doLenientDetection(null,ex);
194             }
195         }
196     }
197 
198     /***
199      * Creates a Reader using an InputStream an the associated content-type header.
200      * <p>
201      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
202      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
203      * prolog encoding uses the default encoding mandated by the content-type MIME type.
204      * <p>
205      * It does a lenient charset encoding detection, check the constructor with the lenient parameter
206      * for details.
207      * <p>
208      * @param is InputStream to create the reader from.
209      * @param httpContentType content-type header to use for the resolution of the charset encoding.
210      * @throws IOException thrown if there is a problem reading the file.
211      *
212      */
213     public XmlReader(InputStream is,String httpContentType) throws IOException {
214         this(is,httpContentType,true);
215     }
216 
217     /***
218      * Creates a Reader using an InputStream an the associated content-type header. This constructor is
219      * lenient regarding the encoding detection.
220      * <p>
221      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
222      * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
223      * prolog encoding uses the default encoding mandated by the content-type MIME type.
224      * <p>
225      * If lenient detection is indicated and the detection above fails as per specifications it then attempts
226      * the following:
227      * <p>
228      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
229      * <p>
230      * Else if the XML prolog had a charset encoding that encoding is used.
231      * <p>
232      * Else if the content type had a charset encoding that encoding is used.
233      * <p>
234      * Else 'UTF-8' is used.
235      * <p>
236      * If lenient detection is indicated an XmlReaderException is never thrown.
237      * <p>
238      * @param is InputStream to create the reader from.
239      * @param httpContentType content-type header to use for the resolution of the charset encoding.
240      * @param lenient indicates if the charset encoding detection should be relaxed.
241      * @throws IOException thrown if there is a problem reading the file.
242      * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
243      *
244      */
245     public XmlReader(InputStream is,String httpContentType,boolean lenient) throws IOException, XmlReaderException {
246         try {
247             doHttpStream(is,httpContentType);
248         }
249         catch (XmlReaderException ex) {
250             if (!lenient) {
251                 throw ex;
252             }
253             else {
254                 doLenientDetection(httpContentType,ex);
255             }
256         }
257     }
258 
259     private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException {
260         if (httpContentType!=null) {
261             if (httpContentType.startsWith("text/html")) {
262                 httpContentType = httpContentType.substring("text/html".length());
263                 httpContentType = "text/xml" + httpContentType;
264                 try {
265                     doHttpStream(ex.getInputStream(),httpContentType);
266                     ex = null;
267                 }
268                 catch (XmlReaderException ex2) {
269                     ex = ex2;
270                 }
271             }
272         }
273         if (ex!=null) {
274             String encoding = ex.getXmlEncoding();
275             if (encoding==null) {
276                 encoding = ex.getContentTypeEncoding();
277             }
278             if (encoding==null) {
279                 encoding = UTF_8;
280             }
281             prepareReader(ex.getInputStream(),encoding);
282         }
283     }
284 
285     /***
286      * Returns the charset encoding of the XmlReader.
287      * <p>
288      * @return charset encoding.
289      *
290      */
291     public String getEncoding() {
292         return _encoding;
293     }
294 
295     public int read(char[] buf,int offset,int len) throws IOException {
296         return _reader.read(buf,offset,len);
297     }
298 
299     /***
300      * Closes the XmlReader stream.
301      * <p>
302      * @throws IOException thrown if there was a problem closing the stream.
303      *
304      */
305     public void close() throws IOException {
306         _reader.close();
307     }
308 
309     private void doRawStream(InputStream is) throws IOException {
310         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
311         String bomEnc = getBOMEncoding(pis);
312         String xmlGuessEnc =  getXMLGuessEncoding(pis);
313         String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
314         String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
315         prepareReader(pis,encoding);
316     }
317 
318     private void doHttpStream(InputStream is,String httpContentType) throws IOException {
319         PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE);
320         String cTMime = getContentTypeMime(httpContentType);
321         String cTEnc  = getContentTypeEncoding(httpContentType);
322         String bomEnc = getBOMEncoding(pis);
323         String xmlGuessEnc =  getXMLGuessEncoding(pis);
324         String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc);
325         String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis);
326         prepareReader(pis,encoding);
327     }
328 
329     private void prepareReader(InputStream is,String encoding) throws IOException {
330         _reader = new InputStreamReader(is,encoding);
331         _encoding = encoding;
332     }
333 
334     // InputStream is passed for XmlReaderException creation only
335     private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
336         String encoding;
337         if (bomEnc==null) {
338             if (xmlGuessEnc==null || xmlEnc==null) {
339                 encoding = UTF_8;
340             }
341             else
342             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
343                 encoding = xmlGuessEnc;
344             }
345             else {
346                 encoding = xmlEnc;
347             }
348         }
349         else
350         if (bomEnc.equals(UTF_8)) {
351             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) {
352                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
353                                              bomEnc,xmlGuessEnc,xmlEnc,is);
354             }
355             if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) {
356                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
357                                              bomEnc,xmlGuessEnc,xmlEnc,is);
358             }
359             encoding = UTF_8;
360         }
361         else
362         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
363             if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) {
364                 throw new IOException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}));
365             }
366             if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
367                 throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
368                                              bomEnc,xmlGuessEnc,xmlEnc,is);
369             }
370             encoding =bomEnc;
371         }
372         else {
373             throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc,xmlGuessEnc,xmlEnc}),
374                                          bomEnc,xmlGuessEnc,xmlEnc,is);
375         }
376         return encoding;
377     }
378 
379     // InputStream is passed for XmlReaderException creation only
380     private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
381         boolean appXml = isAppXml(cTMime);
382         boolean textXml = isTextXml(cTMime);
383         String encoding;
384         if (appXml || textXml) {
385             if (cTEnc==null) {
386                 if (appXml) {
387                     encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
388                 }
389                 else {
390                     encoding = US_ASCII;
391                 }
392             }
393             else
394             if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
395                 throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
396                                              cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
397             }
398             else
399             if (cTEnc.equals(UTF_16)) {
400                 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) {
401                     encoding = bomEnc;
402                 }
403                 else {
404                     throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
405                                                  cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
406                 }
407             }
408             else {
409                 encoding = cTEnc;
410             }
411         }
412         else {
413             throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}),
414                                          cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is);
415         }
416         return encoding;
417     }
418 
419     // returns MIME type or NULL if httpContentType is NULL
420     private static String getContentTypeMime(String httpContentType) {
421         String mime = null;
422         if (httpContentType!=null) {
423             int i = httpContentType.indexOf(";");
424             mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim();
425         }
426         return mime;
427     }
428 
429     private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)");
430 
431     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
432     private static String getContentTypeEncoding(String httpContentType) {
433         String encoding = null;
434         if (httpContentType!=null) {
435             int i = httpContentType.indexOf(";");
436             if (i>-1) {
437                 String postMime = httpContentType.substring(i+1);
438                 Matcher m = CHARSET_PATTERN.matcher(postMime);
439                 encoding = (m.find()) ? m.group(1) : null;
440                 encoding = (encoding!=null) ? encoding.toUpperCase() : null;
441             }
442         }
443         return encoding;
444     }
445 
446     // returns the BOM in the stream, NULL if not present,
447     // if there was BOM the in the stream it is consumed
448     private static String getBOMEncoding(PushbackInputStream is) throws IOException {
449         String encoding = null;
450         int[] bytes = new int[3];
451         bytes[0] = is.read();
452         bytes[1] = is.read();
453         bytes[2] = is.read();
454 
455         if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
456             encoding = UTF_16BE;
457             is.unread(bytes[2]);
458         }
459         else
460         if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
461             encoding = UTF_16LE;
462             is.unread(bytes[2]);
463         }
464         else
465         if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
466             encoding = UTF_8;
467         }
468         else {
469             for (int i=bytes.length-1;i>=0;i--) {
470                 is.unread(bytes[i]);
471             }
472         }
473         return encoding;
474     }
475 
476     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
477     private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
478         String encoding = null;
479         int[] bytes = new int[4];
480         bytes[0] = is.read();
481         bytes[1] = is.read();
482         bytes[2] = is.read();
483         bytes[3] = is.read();
484         for (int i=bytes.length-1;i>=0;i--) {
485             is.unread(bytes[i]);
486         }
487 
488         if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
489                 encoding = UTF_16BE;
490         }
491         else
492         if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
493                 encoding = UTF_16LE;
494         }
495         else
496         if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
497             encoding = UTF_8;
498         }
499         return encoding;
500     }
501 
502     private static final Pattern ENCODING_PATTERN = Pattern.compile("^<//?xml.*encoding=\"(.*)\".*//?>");
503 
504     // returns the encoding declared in the <?xml encoding=...?>,  NULL if none
505     private static String getXMLPrologEncoding(PushbackInputStream is,String guessedEnc) throws IOException {
506         String encoding = null;
507         if (guessedEnc!=null) {
508             byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
509             int offset = 0;
510             int max = PUSHBACK_MAX_SIZE;
511             int c = is.read(bytes,offset,max);
512             while (c!=-1 && offset<PUSHBACK_MAX_SIZE) {
513                 offset += c;
514                 max -= c;
515                 c = is.read(bytes,offset,max);
516             }
517             int bytesRead = offset;
518             if (bytesRead>0) {
519                 is.unread(bytes,0,bytesRead);
520                 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc);
521                 BufferedReader br = new BufferedReader(reader);
522                 String prolog = br.readLine();
523                 Matcher m = ENCODING_PATTERN.matcher(prolog);
524                 encoding = (m.find()) ? m.group(1).toUpperCase() : null;
525             }
526         }
527         return encoding;
528     }
529 
530     // indicates if the MIME type belongs to the APPLICATION XML family
531     private static boolean isAppXml(String mime) {
532         return mime!=null &&
533                (mime.equals("application/xml") ||
534                 mime.equals("application/xml-dtd") ||
535                 mime.equals("application/xml-external-parsed-entity") ||
536                 (mime.startsWith("application/") && mime.endsWith("+xml")));
537     }
538 
539     // indicates if the MIME type belongs to the TEXT XML family
540     private static boolean isTextXml(String mime) {
541         return mime!=null &&
542                (mime.equals("text/xml") ||
543                 mime.equals("text/xml-external-parsed-entity") ||
544                 (mime.startsWith("text/") && mime.endsWith("+xml")));
545     }
546 
547     private static final MessageFormat RAW_EX_1 = new MessageFormat(
548             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
549 
550     private static final MessageFormat RAW_EX_2 = new MessageFormat(
551             "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
552 
553     private static final MessageFormat HTTP_EX_1 = new MessageFormat(
554             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");
555 
556     private static final MessageFormat HTTP_EX_2 = new MessageFormat(
557             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
558 
559     private static final MessageFormat HTTP_EX_3 = new MessageFormat(
560             "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");
561 
562 }