View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.File;
21  import java.io.FileNotFoundException;
22  import java.io.FileReader;
23  import java.io.IOException;
24  import java.io.Reader;
25  import java.util.List;
26  import java.util.Map;
27  import java.util.WeakHashMap;
28  
29  import org.jdom.Document;
30  import org.jdom.JDOMException;
31  import org.jdom.input.DOMBuilder;
32  import org.jdom.input.JDOMParseException;
33  import org.xml.sax.EntityResolver;
34  import org.xml.sax.InputSource;
35  import org.xml.sax.SAXNotRecognizedException;
36  import org.xml.sax.SAXNotSupportedException;
37  import org.xml.sax.XMLReader;
38  
39  import com.sun.syndication.feed.WireFeed;
40  import com.sun.syndication.io.impl.FeedParsers;
41  import com.sun.syndication.io.impl.XmlFixerReader;
42  
43  /***
44   * Parses an XML document (File, InputStream, Reader, W3C SAX InputSource, W3C DOM Document or JDom DOcument)
45   * into an WireFeed (RSS/Atom).
46   * <p>
47   * It accepts all flavors of RSS (0.90, 0.91, 0.92, 0.93, 0.94, 1.0 and 2.0) and
48   * Atom 0.3 feeds. Parsers are plugable (they must implement the WireFeedParser interface).
49   * <p>
50   * The WireFeedInput useds liberal parsers.
51   * <p>
52   * @author Alejandro Abdelnur
53   *
54   */
55  public class WireFeedInput {
56  
57      private static Map clMap = new WeakHashMap();
58  
59      private static FeedParsers getFeedParsers() {
60          synchronized(WireFeedInput.class) {
61              FeedParsers parsers = (FeedParsers)
62                  clMap.get(Thread.currentThread().getContextClassLoader());
63              if (parsers == null) {
64                  parsers = new FeedParsers();
65                  clMap.put(Thread.currentThread().getContextClassLoader(), parsers);
66              }
67              return parsers;
68          }
69      }
70  
71      private static final InputSource EMPTY_INPUTSOURCE = new InputSource(new ByteArrayInputStream(new byte[0]));
72      private static final EntityResolver RESOLVER = new EmptyEntityResolver();
73  
74      private static class EmptyEntityResolver implements EntityResolver {
75          public InputSource resolveEntity(String publicId, String systemId) {
76              if(systemId != null && systemId.endsWith(".dtd")) return EMPTY_INPUTSOURCE;
77              return null;
78          }
79      }
80  
81      private boolean _validate;
82  
83      private boolean _xmlHealerOn;
84  
85      /***
86       * Returns the list of supported input feed types.
87       * <p>
88       * @see WireFeed for details on the format of these strings.
89       * <p>
90       * @return a list of String elements with the supported input feed types.
91       *
92       */
93      public static List getSupportedFeedTypes() {
94          return getFeedParsers().getSupportedFeedTypes();
95      }
96  
97      /***
98       * Creates a WireFeedInput instance with input validation turned off.
99       * <p>
100      *
101      */
102     public WireFeedInput() {
103         this (false);
104     }
105 
106     /***
107      * Creates a WireFeedInput instance.
108      * <p>
109      * @param validate indicates if the input should be validated. NOT IMPLEMENTED YET (validation does not happen)
110      *
111      */
112     public WireFeedInput(boolean validate) {
113         _validate = false; // TODO FIX THIS THINGY
114         _xmlHealerOn = true;
115     }
116 
117     /***
118      * Enables XML healing in the WiredFeedInput instance.
119      * <p>
120      * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
121      * <p>
122      * Healing resolves HTML entities (from literal to code number) in the reader.
123      * <p>
124      * The healing is done only with the build(File) and build(Reader) signatures.
125      * <p>
126      * By default is TRUE.
127      * <p>
128      * @param heals TRUE enables stream healing, FALSE disables it.
129      *
130      */
131     public void setXmlHealerOn(boolean heals) {
132         _xmlHealerOn = heals;
133     }
134 
135     /***
136      * Indicates if the WiredFeedInput instance will XML heal (if necessary) the character stream.
137      * <p>
138      * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
139      * <p>
140      * Healing resolves HTML entities (from literal to code number) in the reader.
141      * <p>
142      * The healing is done only with the build(File) and build(Reader) signatures.
143      * <p>
144      * By default is TRUE.
145      * <p>
146      * @return TRUE if healing is enabled, FALSE if not.
147      *
148      */
149     public boolean getXmlHealerOn() {
150         return _xmlHealerOn;
151     }
152 
153     /***
154      * Builds an WireFeed (RSS or Atom) from a file.
155      * <p>
156      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
157      * <p>
158      * @param file file to read to create the WireFeed.
159      * @return the WireFeed read from the file.
160      * @throws FileNotFoundException thrown if the file could not be found.
161      * @throws IOException thrown if there is problem reading the file.
162      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
163      * @throws FeedException if the feed could not be parsed
164      *
165      */
166     public WireFeed build(File file) throws FileNotFoundException,IOException,IllegalArgumentException,FeedException {
167         WireFeed feed;
168         Reader reader = new FileReader(file);
169         if (_xmlHealerOn) {
170             reader = new XmlFixerReader(reader);
171         }
172         feed = build(reader);
173         reader.close();
174         return feed;
175     }
176 
177     /***
178      * Builds an WireFeed (RSS or Atom) from an Reader.
179      * <p>
180      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
181      * <p>
182      * @param reader Reader to read to create the WireFeed.
183      * @return the WireFeed read from the Reader.
184      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
185      * @throws FeedException if the feed could not be parsed
186      *
187      */
188     public WireFeed build(Reader reader) throws IllegalArgumentException,FeedException {
189     	SAXBuilder saxBuilder = createSAXBuilder();
190         try {
191             if (_xmlHealerOn) {
192                 reader = new XmlFixerReader(reader);
193             }            
194             Document document = saxBuilder.build(reader);
195             return build(document);
196         }
197         catch (JDOMParseException ex) {
198             throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
199         }
200         catch (IllegalArgumentException ex) {
201             throw ex;
202         }
203         catch (Exception ex) {
204             throw new ParsingFeedException("Invalid XML",ex);
205         }
206     }
207 
208     /***
209      * Builds an WireFeed (RSS or Atom) from an W3C SAX InputSource.
210      * <p>
211      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
212      * <p>
213      * @param is W3C SAX InputSource to read to create the WireFeed.
214      * @return the WireFeed read from the W3C SAX InputSource.
215      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
216      * @throws FeedException if the feed could not be parsed
217      *
218      */
219     public WireFeed build(InputSource is) throws IllegalArgumentException,FeedException {
220     	SAXBuilder saxBuilder = createSAXBuilder();
221         try {
222             Document document = saxBuilder.build(is);
223             return build(document);
224         }
225         catch (JDOMParseException ex) {
226             throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
227         }
228         catch (IllegalArgumentException ex) {
229             throw ex;
230         }
231         catch (Exception ex) {
232             throw new ParsingFeedException("Invalid XML",ex);
233         }
234     }
235 
236     /***
237      * Builds an WireFeed (RSS or Atom) from an W3C DOM document.
238      * <p>
239      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
240      * <p>
241      * @param document W3C DOM document to read to create the WireFeed.
242      * @return the WireFeed read from the W3C DOM document.
243      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
244      * @throws FeedException if the feed could not be parsed
245      *
246      */
247     public WireFeed build(org.w3c.dom.Document document) throws IllegalArgumentException,FeedException {
248         DOMBuilder domBuilder = new DOMBuilder();        
249         try {
250             Document jdomDoc = domBuilder.build(document);
251             return build(jdomDoc);
252         }
253         catch (IllegalArgumentException ex) {
254             throw ex;
255         }
256         catch (Exception ex) {
257             throw new ParsingFeedException("Invalid XML",ex);
258         }
259     }
260 
261     /***
262      * Builds an WireFeed (RSS or Atom) from an JDOM document.
263      * <p>
264      * NOTE: All other build methods delegate to this method.
265      * <p>
266      * @param document JDOM document to read to create the WireFeed.
267      * @return the WireFeed read from the JDOM document.
268      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
269      * @throws FeedException if the feed could not be parsed
270      *
271      */
272     public WireFeed build(Document document) throws IllegalArgumentException,FeedException {
273         WireFeedParser parser = getFeedParsers().getParserFor(document);
274         if (parser==null) {
275             throw new IllegalArgumentException("Invalid document");
276         }
277         return parser.parse(document, _validate);
278     }
279 
280     /***
281      * Creates and sets up a org.jdom.input.SAXBuilder for parsing.
282      * 
283      * @return a new org.jdom.input.SAXBuilder object
284      */
285     protected SAXBuilder createSAXBuilder() {
286         SAXBuilder saxBuilder = new SAXBuilder(_validate);        
287         saxBuilder.setEntityResolver(RESOLVER);
288 
289         //
290         // This code is needed to fix the security problem outlined in http://www.securityfocus.com/archive/1/297714
291         //
292         // Unfortunately there isn't an easy way to check if an XML parser supports a particular feature, so
293         // we need to set it and catch the exception if it fails. We also need to subclass the JDom SAXBuilder 
294         // class in order to get access to the underlying SAX parser - otherwise the features don't get set until
295         // we are already building the document, by which time it's too late to fix the problem.
296         //
297         // Crimson is one parser which is known not to support these features.
298 		try {
299 			XMLReader parser = saxBuilder.createParser();
300 			try {				
301 				parser.setFeature("http://xml.org/sax/features/external-general-entities", false);
302 				saxBuilder.setFeature("http://xml.org/sax/features/external-general-entities", false);
303 			} catch (SAXNotRecognizedException e) {
304 				// ignore
305 			} catch (SAXNotSupportedException e) {
306 				// ignore
307 			}
308 			
309 			try {
310 				parser.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
311 				saxBuilder.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
312 			} catch (SAXNotRecognizedException e) {
313 				// ignore
314 			} catch (SAXNotSupportedException e) {
315 				// ignore
316 			}
317 
318 			try {
319 				parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
320 				saxBuilder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
321 			} catch (SAXNotRecognizedException e) {
322 				// ignore
323 			} catch (SAXNotSupportedException e) {
324 				// ignore
325 			}
326 			
327 		} catch (JDOMException e) {
328 			throw new IllegalStateException("JDOM could not create a SAX parser");
329 		}
330 
331 		saxBuilder.setExpandEntities(false);    
332         return saxBuilder;
333     }
334 }