View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io;
18  
19  import java.io.*;
20  import java.net.URL;
21  import java.net.URLConnection;
22  import java.net.HttpURLConnection;
23  import java.util.regex.Pattern;
24  import java.util.regex.Matcher;
25  import java.text.MessageFormat;
26  
/**
 * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out
 * the charset encoding of the XML document within the stream.
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
 * character stream.
 * <p>
 * All this has to be done without consuming characters from the stream, otherwise the XML parser
 * will not recognize the document as valid XML. This is not 100% true, but it's close enough
 * (the UTF-8 BOM is not handled by all parsers right now, XmlReader consumes it so things work
 * in all parsers).
 * <p>
 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and
 * HTTP streams by offering a wide set of constructors.
 * <p>
 * The encoding resolution follows the rules defined by HTTP, MIME types and the XML
 * specification. All this is nicely explained by Mark Pilgrim in his blog,
 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
 * <p>
 * @author Alejandro Abdelnur
 *
 */
public class XmlReader extends Reader {
    // Size of the push-back buffer; also the maximum number of bytes inspected for the XML prolog.
    private static final int PUSHBACK_MAX_SIZE = 1024;

    private static final String UTF_8 = "UTF-8";
    private static final String US_ASCII = "US-ASCII";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    // Extracts the (optionally quoted) value of the 'charset' parameter of a content-type header.
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("\\bcharset\\s*=\\s*[\"']?([^;\"'\\s]+)", Pattern.CASE_INSENSITIVE);

    // Extracts the value of the 'encoding' pseudo-attribute of an XML declaration. The value may
    // be single or double quoted; the reluctant '.*?' keeps the match on the first 'encoding='.
    private static final Pattern ENCODING_PATTERN = Pattern.compile(
            "^<\\?xml\\s.*?\\bencoding\\s*=\\s*[\"']([A-Za-z][A-Za-z0-9._\\-]*)[\"']", Pattern.DOTALL);

    // Message patterns are kept as Strings and formatted through the static
    // MessageFormat.format(): a shared MessageFormat instance is not safe for concurrent use.
    private static final String RAW_EX_1 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    private static final String RAW_EX_2 =
            "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    private static final String HTTP_EX_1 =
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    private static final String HTTP_EX_2 =
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    private static final String HTTP_EX_3 =
            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";

    private Reader _reader;
    private String _encoding;

    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for a BOM first, if none sniffs the XML prolog charset, if this is also
     * missing defaults to UTF-8.
     * <p>
     * @param file File to create a Reader from.
     * @throws IOException thrown if there is a problem reading the file.
     *
     */
    public XmlReader(File file) throws IOException {
        InputStream is = new FileInputStream(file);
        boolean initialized = false;
        try {
            doRawStream(is);
            initialized = true;
        } finally {
            // The caller never sees this stream, so it must not leak when detection fails.
            if (!initialized) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // Ignored on purpose: the original failure is the one worth reporting.
                }
            }
        }
    }

    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * <p>
     * @param is InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     *
     */
    public XmlReader(InputStream is) throws IOException {
        doRawStream(is);
    }

    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is no 'content-type' header in the fetched
     * data it uses the same logic used for Files.
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * @param url URL to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URL.
     *
     */
    public XmlReader(URL url) throws IOException {
        this(url.openConnection());
    }

    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
     * If the URLConnection is not of type HttpURLConnection and there is no
     * 'content-type' header in the fetched data it uses the same logic used for files.
     * <p>
     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched
     * data it uses the same logic used for an InputStream with content-type.
     * <p>
     * @param conn URLConnection to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
     *
     */
    public XmlReader(URLConnection conn) throws IOException {
        InputStream is = conn.getInputStream();
        String contentType = conn.getContentType();
        if (conn instanceof HttpURLConnection || contentType != null) {
            doHttpStream(is, contentType);
        } else {
            doRawStream(is);
        }
    }

    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
     * The MIME type must belong to the XML family (application/xml, text/xml, *+xml, ...),
     * otherwise an IOException is thrown. If the content-type carries a charset it is used
     * (UTF-16 requires a matching BOM, UTF-16BE/LE forbid one). Without a charset,
     * application XML types fall back to the raw stream rules (BOM, then XML prolog, then
     * UTF-8) and text XML types default to US-ASCII.
     * <p>
     * @param is InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws IOException thrown if there is a problem reading the stream or the encoding
     *         information is inconsistent.
     *
     */
    public XmlReader(InputStream is, String httpContentType) throws IOException {
        doHttpStream(is, httpContentType);
    }

    /**
     * Returns the charset encoding of the XmlReader.
     * <p>
     * @return charset encoding.
     *
     */
    public String getEncoding() {
        return _encoding;
    }

    /**
     * Reads characters, decoded with the detected encoding, into a portion of an array.
     * <p>
     * @param buf destination buffer.
     * @param offset offset at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return number of characters read, or -1 if the end of the stream has been reached.
     * @throws IOException thrown if there is a problem reading the stream.
     *
     */
    public int read(char[] buf, int offset, int len) throws IOException {
        return _reader.read(buf, offset, len);
    }

    /**
     * Closes the XmlReader stream.
     * <p>
     * @throws IOException thrown if there was a problem closing the stream.
     *
     */
    public void close() throws IOException {
        _reader.close();
    }

    // Detects the encoding of a stream that comes without content-type information.
    private void doRawStream(InputStream is) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXMLPrologEncoding(pis, xmlGuessEnc);
        _encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
        _reader = new InputStreamReader(pis, _encoding);
    }

    // Detects the encoding of a stream that comes with a content-type header.
    private void doHttpStream(InputStream is, String httpContentType) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String cTMime = getContentTypeMime(httpContentType);
        String cTEnc = getContentTypeEncoding(httpContentType);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXMLPrologEncoding(pis, xmlGuessEnc);
        _encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
        _reader = new InputStreamReader(pis, _encoding);
    }

    // Resolves the encoding from the BOM, the first-bytes guess and the XML prolog declaration.
    // Throws IOException when those sources contradict each other.
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc) throws IOException {
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                return UTF_8;
            }
            // A generic UTF-16 declaration is refined with the byte order seen in the stream.
            boolean guessIsUtf16 = xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE);
            return (xmlEnc.equals(UTF_16) && guessIsUtf16) ? xmlGuessEnc : xmlEnc;
        }

        Object[] details = new Object[] {bomEnc, xmlGuessEnc, xmlEnc};
        if (bomEnc.equals(UTF_8)) {
            boolean guessMismatch = xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8);
            boolean prologMismatch = xmlEnc != null && !xmlEnc.equals(UTF_8);
            if (guessMismatch || prologMismatch) {
                throw new IOException(MessageFormat.format(RAW_EX_1, details));
            }
            return UTF_8;
        }
        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            boolean guessMismatch = xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc);
            boolean prologMismatch = xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc);
            if (guessMismatch || prologMismatch) {
                throw new IOException(MessageFormat.format(RAW_EX_1, details));
            }
            return bomEnc;
        }
        throw new IOException(MessageFormat.format(RAW_EX_2, details));
    }

    // Resolves the encoding when a content-type header is available.
    // Throws IOException for non XML MIME types or when header and BOM contradict each other.
    private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc) throws IOException {
        Object[] details = new Object[] {cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc};
        boolean appXml = isAppXml(cTMime);
        boolean textXml = isTextXml(cTMime);
        if (!appXml && !textXml) {
            throw new IOException(MessageFormat.format(HTTP_EX_3, details));
        }
        if (cTEnc == null) {
            // No charset parameter: application types are sniffed, text types default to US-ASCII.
            return appXml ? calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc) : US_ASCII;
        }
        if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
            // An explicit byte order in the header must not be accompanied by a BOM.
            throw new IOException(MessageFormat.format(HTTP_EX_1, details));
        }
        if (cTEnc.equals(UTF_16)) {
            // A generic UTF-16 header needs a UTF-16 BOM to tell the byte order.
            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                return bomEnc;
            }
            throw new IOException(MessageFormat.format(HTTP_EX_2, details));
        }
        return cTEnc;
    }

    // returns MIME type or NULL if httpContentType is NULL
    private static String getContentTypeMime(String httpContentType) {
        if (httpContentType == null) {
            return null;
        }
        int i = httpContentType.indexOf(';');
        String mime = (i == -1) ? httpContentType : httpContentType.substring(0, i);
        return mime.trim();
    }

    // returns charset parameter value (upper-cased, quotes removed), NULL if not present,
    // NULL if httpContentType is NULL
    private static String getContentTypeEncoding(String httpContentType) {
        if (httpContentType == null) {
            return null;
        }
        int i = httpContentType.indexOf(';');
        if (i == -1) {
            return null;
        }
        Matcher m = CHARSET_PATTERN.matcher(httpContentType.substring(i + 1));
        // Locale independent upper-casing, charset names are plain ASCII.
        return m.find() ? m.group(1).toUpperCase(java.util.Locale.ENGLISH) : null;
    }

    // returns the encoding of the BOM in the stream, NULL if not present,
    // if there was a BOM in the stream it is consumed
    private static String getBOMEncoding(PushbackInputStream is) throws IOException {
        int[] bytes = new int[3];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = is.read();
        }

        String encoding = null;
        if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            encoding = UTF_16BE;
            pushBack(is, bytes, 2);
        } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            encoding = UTF_16LE;
            pushBack(is, bytes, 2);
        } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
            encoding = UTF_8;
        } else {
            pushBack(is, bytes, 0);
        }
        return encoding;
    }

    // returns the best guess for the encoding by looking the first bytes of the stream, '<?',
    // the stream is left untouched
    private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
        int[] bytes = new int[4];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = is.read();
        }
        pushBack(is, bytes, 0);

        String encoding = null;
        if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
            encoding = UTF_16BE;
        } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
            encoding = UTF_16LE;
        } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
            encoding = UTF_8;
        }
        return encoding;
    }

    // pushes bytes[from..] back into the stream in reverse order so they are read again in
    // their original order; end-of-stream markers (-1) are skipped, unreading them would
    // inject a spurious 0xFF byte into the stream
    private static void pushBack(PushbackInputStream is, int[] bytes, int from) throws IOException {
        for (int i = bytes.length - 1; i >= from; i--) {
            if (bytes[i] != -1) {
                is.unread(bytes[i]);
            }
        }
    }

    // returns the encoding declared in the <?xml encoding=...?> (upper-cased), NULL if none,
    // the stream is left untouched
    private static String getXMLPrologEncoding(PushbackInputStream is, String guessedEnc) throws IOException {
        if (guessedEnc == null) {
            return null;
        }
        byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
        int bytesRead = is.read(bytes);
        if (bytesRead <= 0) {
            return null;
        }
        is.unread(bytes, 0, bytesRead);

        String head = new String(bytes, 0, bytesRead, guessedEnc);
        int declarationEnd = head.indexOf("?>");
        if (declarationEnd == -1) {
            // No complete XML declaration within the inspected bytes.
            return null;
        }
        // Only the declaration itself is searched so 'encoding=' in the document body is ignored.
        Matcher m = ENCODING_PATTERN.matcher(head.substring(0, declarationEnd + 2));
        return m.find() ? m.group(1).toUpperCase(java.util.Locale.ENGLISH) : null;
    }

    // indicates if the MIME type belongs to the APPLICATION XML family (case insensitive)
    private static boolean isAppXml(String mime) {
        if (mime == null) {
            return false;
        }
        String m = mime.toLowerCase(java.util.Locale.ENGLISH);
        return m.equals("application/xml") ||
               m.equals("application/xml-dtd") ||
               m.equals("application/xml-external-parsed-entity") ||
               (m.startsWith("application/") && m.endsWith("+xml"));
    }

    // indicates if the MIME type belongs to the TEXT XML family (case insensitive)
    private static boolean isTextXml(String mime) {
        if (mime == null) {
            return false;
        }
        String m = mime.toLowerCase(java.util.Locale.ENGLISH);
        return m.equals("text/xml") ||
               m.equals("text/xml-external-parsed-entity") ||
               (m.startsWith("text/") && m.endsWith("+xml"));
    }

}