View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.File;
21  import java.io.FileNotFoundException;
22  import java.io.FileReader;
23  import java.io.IOException;
24  import java.io.Reader;
25  import java.util.List;
26  
27  import org.jdom.Document;
28  import org.jdom.JDOMException;
29  import org.jdom.input.DOMBuilder;
30  import org.jdom.input.JDOMParseException;
31  import org.xml.sax.EntityResolver;
32  import org.xml.sax.InputSource;
33  import org.xml.sax.SAXNotRecognizedException;
34  import org.xml.sax.SAXNotSupportedException;
35  import org.xml.sax.XMLReader;
36  
37  import com.sun.syndication.feed.WireFeed;
38  import com.sun.syndication.io.impl.FeedParsers;
39  import com.sun.syndication.io.impl.XmlFixerReader;
40  
41  /***
42   * Parses an XML document (File, InputStream, Reader, W3C SAX InputSource, W3C DOM Document or JDom DOcument)
43   * into an WireFeed (RSS/Atom).
44   * <p>
45   * It accepts all flavors of RSS (0.90, 0.91, 0.92, 0.93, 0.94, 1.0 and 2.0) and
46   * Atom 0.3 feeds. Parsers are plugable (they must implement the WireFeedParser interface).
47   * <p>
48   * The WireFeedInput useds liberal parsers.
49   * <p>
50   * @author Alejandro Abdelnur
51   *
52   */
53  public class WireFeedInput {
54      private static FeedParsers FEED_PARSERS = new FeedParsers();
55      private static final InputSource EMPTY_INPUTSOURCE = new InputSource(new ByteArrayInputStream(new byte[0]));
56      private static final EntityResolver RESOLVER = new EmptyEntityResolver();
57  
58      private static class EmptyEntityResolver implements EntityResolver {
59          public InputSource resolveEntity(String publicId, String systemId) {
60              if(systemId != null && systemId.endsWith(".dtd")) return EMPTY_INPUTSOURCE;
61              return null;
62          }
63      }
64  
65      private boolean _validate;
66  
67      private boolean _xmlHealerOn;
68  
69      /***
70       * Returns the list of supported input feed types.
71       * <p>
72       * @see WireFeed for details on the format of these strings.
73       * <p>
74       * @return a list of String elements with the supported input feed types.
75       *
76       */
77      public static List getSupportedFeedTypes() {
78          return FEED_PARSERS.getSupportedFeedTypes();
79      }
80  
81      /***
82       * Creates a WireFeedInput instance with input validation turned off.
83       * <p>
84       *
85       */
86      public WireFeedInput() {
87          this (false);
88      }
89  
90      /***
91       * Creates a WireFeedInput instance.
92       * <p>
93       * @param validate indicates if the input should be validated. NOT IMPLEMENTED YET (validation does not happen)
94       *
95       */
96      public WireFeedInput(boolean validate) {
97          _validate = false; // TODO FIX THIS THINGY
98          _xmlHealerOn = true;
99      }
100 
101     /***
102      * Enables XML healing in the WiredFeedInput instance.
103      * <p>
104      * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
105      * <p>
106      * Healing resolves HTML entities (from literal to code number) in the reader.
107      * <p>
108      * The healing is done only with the build(File) and build(Reader) signatures.
109      * <p>
110      * By default is TRUE.
111      * <p>
112      * @param heals TRUE enables stream healing, FALSE disables it.
113      *
114      */
115     public void setXmlHealerOn(boolean heals) {
116         _xmlHealerOn = heals;
117     }
118 
119     /***
120      * Indicates if the WiredFeedInput instance will XML heal (if necessary) the character stream.
121      * <p>
122      * Healing trims leading chars from the stream (empty spaces and comments) until the XML prolog.
123      * <p>
124      * Healing resolves HTML entities (from literal to code number) in the reader.
125      * <p>
126      * The healing is done only with the build(File) and build(Reader) signatures.
127      * <p>
128      * By default is TRUE.
129      * <p>
130      * @return TRUE if healing is enabled, FALSE if not.
131      *
132      */
133     public boolean getXmlHealerOn() {
134         return _xmlHealerOn;
135     }
136 
137     /***
138      * Builds an WireFeed (RSS or Atom) from a file.
139      * <p>
140      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
141      * <p>
142      * @param file file to read to create the WireFeed.
143      * @return the WireFeed read from the file.
144      * @throws FileNotFoundException thrown if the file could not be found.
145      * @throws IOException thrown if there is problem reading the file.
146      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
147      * @throws FeedException if the feed could not be parsed
148      *
149      */
150     public WireFeed build(File file) throws FileNotFoundException,IOException,IllegalArgumentException,FeedException {
151         WireFeed feed;
152         Reader reader = new FileReader(file);
153         if (_xmlHealerOn) {
154             reader = new XmlFixerReader(reader);
155         }
156         feed = build(reader);
157         reader.close();
158         return feed;
159     }
160 
161     /***
162      * Builds an WireFeed (RSS or Atom) from an Reader.
163      * <p>
164      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
165      * <p>
166      * @param reader Reader to read to create the WireFeed.
167      * @return the WireFeed read from the Reader.
168      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
169      * @throws FeedException if the feed could not be parsed
170      *
171      */
172     public WireFeed build(Reader reader) throws IllegalArgumentException,FeedException {
173     	SAXBuilder saxBuilder = createSAXBuilder();
174         try {
175             if (_xmlHealerOn) {
176                 reader = new XmlFixerReader(reader);
177             }            
178             Document document = saxBuilder.build(reader);
179             return build(document);
180         }
181         catch (JDOMParseException ex) {
182             throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
183         }
184         catch (Exception ex) {
185             throw new ParsingFeedException("Invalid XML",ex);
186         }
187     }
188 
189     /***
190      * Builds an WireFeed (RSS or Atom) from an W3C SAX InputSource.
191      * <p>
192      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
193      * <p>
194      * @param is W3C SAX InputSource to read to create the WireFeed.
195      * @return the WireFeed read from the W3C SAX InputSource.
196      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
197      * @throws FeedException if the feed could not be parsed
198      *
199      */
200     public WireFeed build(InputSource is) throws IllegalArgumentException,FeedException {
201     	SAXBuilder saxBuilder = createSAXBuilder();
202         try {
203             Document document = saxBuilder.build(is);
204             return build(document);
205         }
206         catch (JDOMParseException ex) {
207             throw new ParsingFeedException("Invalid XML: " + ex.getMessage(), ex);
208         }
209         catch (Exception ex) {
210             throw new ParsingFeedException("Invalid XML",ex);
211         }
212     }
213 
214     /***
215      * Builds an WireFeed (RSS or Atom) from an W3C DOM document.
216      * <p>
217      * NOTE: This method delages to the 'AsbtractFeed WireFeedInput#build(org.jdom.Document)'.
218      * <p>
219      * @param document W3C DOM document to read to create the WireFeed.
220      * @return the WireFeed read from the W3C DOM document.
221      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
222      * @throws FeedException if the feed could not be parsed
223      *
224      */
225     public WireFeed build(org.w3c.dom.Document document) throws IllegalArgumentException,FeedException {
226         DOMBuilder domBuilder = new DOMBuilder();        
227         try {
228             Document jdomDoc = domBuilder.build(document);
229             return build(jdomDoc);
230         }
231         catch (Exception ex) {
232             throw new ParsingFeedException("Invalid XML",ex);
233         }
234     }
235 
236     /***
237      * Builds an WireFeed (RSS or Atom) from an JDOM document.
238      * <p>
239      * NOTE: All other build methods delegate to this method.
240      * <p>
241      * @param document JDOM document to read to create the WireFeed.
242      * @return the WireFeed read from the JDOM document.
243      * @throws IllegalArgumentException thrown if feed type could not be understood by any of the underlying parsers.
244      * @throws FeedException if the feed could not be parsed
245      *
246      */
247     public WireFeed build(Document document) throws IllegalArgumentException,FeedException {
248         WireFeedParser parser = FEED_PARSERS.getParserFor(document);
249         if (parser==null) {
250             throw new IllegalArgumentException("Invalid document");
251         }
252         return parser.parse(document, _validate);
253     }
254 
255     /***
256      * Creates and sets up a org.jdom.input.SAXBuilder for parsing.
257      * 
258      * @return a new org.jdom.input.SAXBuilder object
259      */
260     protected SAXBuilder createSAXBuilder() {
261         SAXBuilder saxBuilder = new SAXBuilder(_validate);        
262         saxBuilder.setEntityResolver(RESOLVER);
263 
264         //
265         // This code is needed to fix the security problem outlined in http://www.securityfocus.com/archive/1/297714
266         //
267         // Unfortunately there isn't an easy way to check if an XML parser supports a particular feature, so
268         // we need to set it and catch the exception if it fails. We also need to subclass the JDom SAXBuilder 
269         // class in order to get access to the underlying SAX parser - otherwise the features don't get set until
270         // we are already building the document, by which time it's too late to fix the problem.
271         //
272         // Crimson is one parser which is known not to support these features.
273 		try {
274 			XMLReader parser = saxBuilder.createParser();
275 			try {				
276 				parser.setFeature("http://xml.org/sax/features/external-general-entities", false);
277 				saxBuilder.setFeature("http://xml.org/sax/features/external-general-entities", false);
278 			} catch (SAXNotRecognizedException e) {
279 				// ignore
280 			} catch (SAXNotSupportedException e) {
281 				// ignore
282 			}
283 			
284 			try {
285 				parser.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
286 				saxBuilder.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
287 			} catch (SAXNotRecognizedException e) {
288 				// ignore
289 			} catch (SAXNotSupportedException e) {
290 				// ignore
291 			}
292 
293 			try {
294 				parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
295 				saxBuilder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
296 			} catch (SAXNotRecognizedException e) {
297 				// ignore
298 			} catch (SAXNotSupportedException e) {
299 				// ignore
300 			}
301 			
302 		} catch (JDOMException e) {
303 			throw new IllegalStateException("JDOM could not create a SAX parser");
304 		}
305 
306 		saxBuilder.setExpandEntities(false);    
307         return saxBuilder;
308     }
309 }