1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.unittest;
18  
19  import com.sun.syndication.io.XmlReader;
20  import junit.framework.TestCase;
21  
22  import java.io.*;
23  import java.text.MessageFormat;
24  import java.util.HashMap;
25  import java.util.Map;
26  
27  /***
28   * @author pat, tucu
29   *
30   */
31  public class TestXmlReader extends TestCase {
32      private static final String XML5 = "xml-prolog-encoding-spaced-single-quotes";
33      private static final String XML4 = "xml-prolog-encoding-single-quotes";
34      private static final String XML3 = "xml-prolog-encoding-double-quotes";
35      private static final String XML2 = "xml-prolog";
36      private static final String XML1 = "xml";
37  
38      public static void main(String[] args) throws Exception {
39          TestXmlReader test = new TestXmlReader();
40          test.testRawBom();
41          test.testRawNoBom();
42          test.testHttp();
43      }
44  
45      protected void _testRawNoBomValid(String encoding) throws Exception {
46          InputStream is = getXmlStream("no-bom",XML1,encoding,encoding);
47          XmlReader xmlReader = new XmlReader(is,false);
48          assertEquals(xmlReader.getEncoding(),"UTF-8");
49  
50          is = getXmlStream("no-bom",XML2,encoding,encoding);
51          xmlReader = new XmlReader(is);
52          assertEquals(xmlReader.getEncoding(),"UTF-8");
53  
54          is = getXmlStream("no-bom",XML3,encoding,encoding);
55          xmlReader = new XmlReader(is);
56          assertEquals(xmlReader.getEncoding(),encoding);
57  
58          is = getXmlStream("no-bom", XML4, encoding, encoding);
59          xmlReader = new XmlReader(is);
60          assertEquals(xmlReader.getEncoding(), encoding);
61  
62          is = getXmlStream("no-bom", XML5, encoding, encoding);
63          xmlReader = new XmlReader(is);
64          assertEquals(xmlReader.getEncoding(), encoding);
65      }
66  
67      protected void _testRawNoBomInvalid(String encoding) throws Exception {
68          InputStream is = getXmlStream("no-bom",XML3,encoding,encoding);
69          try {
70              XmlReader xmlReader = new XmlReader(is,false);
71              fail("It should have failed");
72          }
73          catch (IOException ex) {
74              assertTrue(ex.getMessage().indexOf("Invalid encoding,")>-1);
75          }
76       }
77  
78      public void testRawNoBom() throws Exception {
79          _testRawNoBomValid("US-ASCII");
80          _testRawNoBomValid("UTF-8");
81          _testRawNoBomValid("ISO-8859-1");
82      }
83  
84      protected void _testRawBomValid(String encoding) throws Exception {
85          InputStream is = getXmlStream(encoding+"-bom",XML3,encoding,encoding);
86          XmlReader xmlReader = new XmlReader(is,false);
87          if (!encoding.equals("UTF-16")) {
88              assertEquals(xmlReader.getEncoding(),encoding);
89          }
90          else {
91              assertEquals(xmlReader.getEncoding().substring(0,encoding.length()),encoding);
92          }
93      }
94  
95      protected void _testRawBomInvalid(String bomEnc,String streamEnc,String prologEnc) throws Exception {
96          InputStream is = getXmlStream(bomEnc,XML3,streamEnc,prologEnc);
97          try {
98              XmlReader xmlReader = new XmlReader(is,false);
99              fail("It should have failed for BOM "+bomEnc+", streamEnc "+streamEnc+" and prologEnc "+prologEnc);
100         }
101         catch (IOException ex) {
102             assertTrue(ex.getMessage().indexOf("Invalid encoding,")>-1);
103         }
104      }
105 
106     public void testRawBom() throws Exception {
107         _testRawBomValid("UTF-8");
108         _testRawBomValid("UTF-16BE");
109         _testRawBomValid("UTF-16LE");
110         _testRawBomValid("UTF-16");
111 
112         _testRawBomInvalid("UTF-8-bom","US-ASCII","US-ASCII");
113         _testRawBomInvalid("UTF-8-bom","ISO-8859-1","ISO-8859-1");
114         _testRawBomInvalid("UTF-8-bom","UTF-8","UTF-16");
115         _testRawBomInvalid("UTF-8-bom","UTF-8","UTF-16BE");
116         _testRawBomInvalid("UTF-8-bom","UTF-8","UTF-16LE");
117         _testRawBomInvalid("UTF-16BE-bom","UTF-16BE","UTF-16LE");
118         _testRawBomInvalid("UTF-16LE-bom","UTF-16LE","UTF-16BE");
119         _testRawBomInvalid("UTF-16LE-bom","UTF-16LE","UTF-8");
120     }
121 
122     public void testHttp() throws Exception {
123         _testHttpValid("application/xml","no-bom","US-ASCII",null);
124         _testHttpValid("application/xml","UTF-8-bom","US-ASCII",null);
125         _testHttpValid("application/xml","UTF-8-bom","UTF-8",null);
126         _testHttpValid("application/xml","UTF-8-bom","UTF-8","UTF-8");
127         _testHttpValid("application/xml;charset=UTF-8","UTF-8-bom","UTF-8",null);
128         _testHttpValid("application/xml;charset=\"UTF-8\"","UTF-8-bom","UTF-8",null);
129         _testHttpValid("application/xml;charset='UTF-8'","UTF-8-bom","UTF-8",null);
130         _testHttpValid("application/xml;charset=UTF-8","UTF-8-bom","UTF-8","UTF-8");
131         _testHttpValid("application/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE",null);
132         _testHttpValid("application/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16");
133         _testHttpValid("application/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16BE");
134 
135         _testHttpInvalid("application/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE",null);
136         _testHttpInvalid("application/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16");
137         _testHttpInvalid("application/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16BE");
138         _testHttpInvalid("application/xml","UTF-8-bom","US-ASCII","US-ASCII");
139         _testHttpInvalid("application/xml;charset=UTF-16","UTF-16LE","UTF-8","UTF-8");
140         _testHttpInvalid("application/xml;charset=UTF-16","no-bom","UTF-16BE","UTF-16BE");
141 
142         _testHttpValid("text/xml","no-bom","US-ASCII",null);
143         _testHttpValid("text/xml;charset=UTF-8","UTF-8-bom","UTF-8","UTF-8");
144         _testHttpValid("text/xml;charset=UTF-8","UTF-8-bom","UTF-8",null);
145         _testHttpValid("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE",null);
146         _testHttpValid("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16");
147         _testHttpValid("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16BE");
148         _testHttpValid("text/xml","UTF-8-bom","US-ASCII",null);
149 
150         _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8", null, null);
151         _testAlternateDefaultEncoding("application/xml", "no-bom", "US-ASCII", null, "US-ASCII");
152         _testAlternateDefaultEncoding("application/xml", "UTF-8-bom", "UTF-8", null, "UTF-8");
153         _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null, null);
154         _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null, "US-ASCII");
155         _testAlternateDefaultEncoding("text/xml", "no-bom", "US-ASCII", null, "UTF-8");
156 
157         _testHttpInvalid("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE",null);
158         _testHttpInvalid("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16");
159         _testHttpInvalid("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16BE");
160         _testHttpInvalid("text/xml;charset=UTF-16","no-bom","UTF-16BE","UTF-16BE");
161         _testHttpInvalid("text/xml;charset=UTF-16","no-bom","UTF-16BE",null);
162 
163         _testHttpLenient("text/xml","no-bom","US-ASCII",null, "US-ASCII");
164         _testHttpLenient("text/xml;charset=UTF-8","UTF-8-bom","UTF-8","UTF-8", "UTF-8");
165         _testHttpLenient("text/xml;charset=UTF-8","UTF-8-bom","UTF-8",null, "UTF-8");
166         _testHttpLenient("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE",null, "UTF-16BE");
167         _testHttpLenient("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16", "UTF-16");
168         _testHttpLenient("text/xml;charset=UTF-16","UTF-16BE-bom","UTF-16BE","UTF-16BE", "UTF-16BE");
169         _testHttpLenient("text/xml","UTF-8-bom","US-ASCII",null, "US-ASCII");
170 
171         _testHttpLenient("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE",null, "UTF-16BE");
172         _testHttpLenient("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16", "UTF-16");
173         _testHttpLenient("text/xml;charset=UTF-16BE","UTF-16BE-bom","UTF-16BE","UTF-16BE", "UTF-16BE");
174         _testHttpLenient("text/xml;charset=UTF-16","no-bom","UTF-16BE","UTF-16BE", "UTF-16BE");
175         _testHttpLenient("text/xml;charset=UTF-16","no-bom","UTF-16BE",null, "UTF-16");
176 
177         _testHttpLenient("text/html","no-bom","US-ASCII","US-ASCII", "US-ASCII");
178         _testHttpLenient("text/html","no-bom","US-ASCII",null, "US-ASCII");
179         _testHttpLenient("text/html;charset=UTF-8","no-bom","US-ASCII","UTF-8", "UTF-8");
180         _testHttpLenient("text/html;charset=UTF-16BE","no-bom","US-ASCII","UTF-8", "UTF-8");
181     }
182 
183     public void _testAlternateDefaultEncoding(String cT, String bomEnc, String streamEnc, String prologEnc, String alternateEnc) throws Exception {
184         try {
185             InputStream is = getXmlStream(bomEnc, (prologEnc == null) ? XML1 : XML3, streamEnc, prologEnc);
186             XmlReader.setDefaultEncoding(alternateEnc);
187             XmlReader xmlReader = new XmlReader(is, cT, false);
188             if (!streamEnc.equals("UTF-16")) {
189                 // we can not assert things here becuase UTF-8, US-ASCII and ISO-8859-1 look alike for the chars used for detection
190             }
191             else {
192                 String enc = (alternateEnc != null) ? alternateEnc : streamEnc;
193                 assertEquals(xmlReader.getEncoding().substring(0, streamEnc.length()), streamEnc);
194             }
195         }
196         finally {
197             XmlReader.setDefaultEncoding(null);
198         }
199     }
200 
201     public void _testHttpValid(String cT, String bomEnc, String streamEnc, String prologEnc) throws Exception {
202         InputStream is = getXmlStream(bomEnc,(prologEnc==null)?XML1 :XML3,streamEnc,prologEnc);
203         XmlReader xmlReader = new XmlReader(is,cT,false);
204         if (!streamEnc.equals("UTF-16")) {
205             // we can not assert things here becuase UTF-8, US-ASCII and ISO-8859-1 look alike for the chars used for detection
206         }
207         else {
208             assertEquals(xmlReader.getEncoding().substring(0,streamEnc.length()),streamEnc);
209         }
210     }
211 
212     protected void _testHttpInvalid(String cT,String bomEnc,String streamEnc,String prologEnc) throws Exception {
213         InputStream is = getXmlStream(bomEnc,(prologEnc==null)?XML2 :XML3,streamEnc,prologEnc);
214         try {
215             new XmlReader(is,cT,false);
216             fail("It should have failed for HTTP Content-type "+cT+", BOM "+bomEnc+", streamEnc "+streamEnc+" and prologEnc "+prologEnc);
217         }
218         catch (IOException ex) {
219             assertTrue(ex.getMessage().indexOf("Invalid encoding,")>-1);
220         }
221      }
222 
223     protected void _testHttpLenient(String cT, String bomEnc, String streamEnc, String prologEnc, String shouldbe) throws Exception {
224         InputStream is = getXmlStream(bomEnc,(prologEnc==null)?XML2 :XML3,streamEnc,prologEnc);
225         XmlReader xmlReader = new XmlReader(is,cT,true);
226         assertEquals(xmlReader.getEncoding(),shouldbe);
227     }
228 
229     private static final String ENCODING_ATTRIBUTE_XML =
230         "<?xml version=\"1.0\" ?> \n" +
231         "<atom:feed xmlns:atom=\"http://www.w3.org/2005/Atom\">\n" +
232         "\n" +
233         "  <atom:entry>\n" +
234         "    <atom:title encoding=\"base64\"><![CDATA\n" +
235         "aW5nTGluZSIgLz4";
236 
237     public void testEncodingAttributeXML() throws Exception {
238         InputStream is = new ByteArrayInputStream(ENCODING_ATTRIBUTE_XML.getBytes());
239         XmlReader xmlReader = new XmlReader(is, "", true);
240         assertEquals(xmlReader.getEncoding(), "UTF-8");
241     }
242     
243     // XML Stream generator
244 
245     private static final int[] NO_BOM_BYTES = {};
246     private static final int[] UTF_16BE_BOM_BYTES = {0xFE,0xFF};
247     private static final int[] UTF_16LE_BOM_BYTES = {0xFF,0XFE};
248     private static final int[] UTF_8_BOM_BYTES = {0xEF,0xBB,0xBF};
249 
250     private static final Map BOMs = new HashMap();
251 
252     static {
253         BOMs.put("no-bom",NO_BOM_BYTES);
254         BOMs.put("UTF-16BE-bom",UTF_16BE_BOM_BYTES);
255         BOMs.put("UTF-16LE-bom",UTF_16LE_BOM_BYTES);
256         BOMs.put("UTF-16-bom",NO_BOM_BYTES); // it's added by the writer
257         BOMs.put("UTF-8-bom",UTF_8_BOM_BYTES);
258     }
259 
260     private static final MessageFormat XML = new MessageFormat(
261             "<root>{2}</root>");
262     private static final MessageFormat XML_WITH_PROLOG = new MessageFormat(
263             "<?xml version=\"1.0\"?>\n<root>{2}</root>");
264     private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES = new MessageFormat(
265             "<?xml version=\"1.0\" encoding=\"{1}\"?>\n<root>{2}</root>");
266     private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES = new MessageFormat(
267             "<?xml version=\"1.0\" encoding=''{1}''?>\n<root>{2}</root>");
268   private static final MessageFormat XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES = new MessageFormat(
269             "<?xml version=\"1.0\" encoding =  \t \n \r''{1}''?>\n<root>{2}</root>");
270 
271   private static final MessageFormat INFO = new MessageFormat(
272             "\nBOM : {0}\nDoc : {1}\nStream Enc : {2}\nProlog Enc : {3}\n");
273 
274     private static final Map XMLs = new HashMap();
275 
276     static {
277         XMLs.put(XML1, XML);
278         XMLs.put(XML2, XML_WITH_PROLOG);
279         XMLs.put(XML3, XML_WITH_PROLOG_AND_ENCODING_DOUBLE_QUOTES);
280         XMLs.put(XML4, XML_WITH_PROLOG_AND_ENCODING_SINGLE_QUOTES);
281         XMLs.put(XML5, XML_WITH_PROLOG_AND_ENCODING_SPACED_SINGLE_QUOTES);
282     }
283 
284     /***
285      *
286      * @param bomType no-bom, UTF-16BE-bom, UTF-16LE-bom, UTF-8-bom
287      * @param xmlType xml, xml-prolog, xml-prolog-charset
288      * @return XML stream
289      */
290     protected InputStream getXmlStream(String bomType,String xmlType,String streamEnc,String prologEnc) throws IOException {
291         ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
292         int[] bom = (int[]) BOMs.get(bomType);
293         if (bom==null) {
294             bom = new int[0];
295         }
296         MessageFormat xml = (MessageFormat) XMLs.get(xmlType);
297         for (int i=0;i<bom.length;i++) {
298             baos.write(bom[i]);
299         }
300         Writer writer = new OutputStreamWriter(baos,streamEnc);
301         String info = INFO.format(new Object[]{bomType,xmlType,prologEnc});
302         String xmlDoc = xml.format(new Object[]{streamEnc,prologEnc,info});
303         writer.write(xmlDoc);
304 
305         // PADDDING TO TEST THINGS WORK BEYOND PUSHBACK_SIZE
306         writer.write("<da>\n");
307         for (int i=0;i<10000;i++) {
308             writer.write("<do/>\n");
309         }
310         writer.write("</da>\n");
311 
312         writer.close();
313         return new ByteArrayInputStream(baos.toByteArray());
314     }
315 
316 
317 }