View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.fetcher.impl;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.net.HttpURLConnection;
24  import java.net.URL;
25  import java.net.URLConnection;
26  import java.util.zip.GZIPInputStream;
27  
28  import com.sun.syndication.feed.synd.SyndFeedI;
29  import com.sun.syndication.fetcher.FetcherEvent;
30  import com.sun.syndication.fetcher.FetcherException;
31  import com.sun.syndication.io.FeedException;
32  import com.sun.syndication.io.SyndFeedInput;
33  
34  /***
35   * <p>Class to retrieve syndication files via HTTP.</p>
36   * 
37   * <p>If passed a {@link com.sun.syndication.fetcher.impl.FeedFetcherCacheI} in the
38   * constructor it will use conditional gets to only retrieve modified content.</p> 
39   * 
40   * <p>The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where
41   * supported by the server.</p>
42   * 
43   * <p>Simple usage:
44   * <pre>
45   * 	// create the cache
46   *	FeedFetcherCacheI feedInfoCache = HashMapFeedInfoCache.getFeedInfoCache();
47   *	// retrieve the feed the first time
48   *	// any subsequent request will use conditional gets and only
49   *	// retrieve the resource if it has changed
50   *	SyndFeedI feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
51   *</pre>
52   *
53   * </p>
54   * 
55   * @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
56   * @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
57   * @author Nick Lothian
58   */
59  public class HttpURLFeedFetcher extends AbstractFeedFetcher {
60  	static final int POLL_EVENT = 1;
61  	static final int RETRIEVE_EVENT = 2;
62  	static final int UNCHANGED_EVENT = 3;
63  		
64  	private FeedFetcherCacheI feedInfoCache;
65  
66  	
67  	/***
68  	 * Constructor to use HttpURLFeedFetcher without caching of feeds
69  	 *
70  	 */
71  	public HttpURLFeedFetcher() {
72  		super();
73  	}
74  	
75  	/***
76  	 * Constructor to enable HttpURLFeedFetcher to cache feeds
77  	 * 
78  	 * @param feedCache - an instance of the FeedFetcherCacheI interface
79  	 */
80  	public HttpURLFeedFetcher(FeedFetcherCacheI feedCache) {
81  		this();
82  		feedInfoCache = feedCache;
83  	}
84  	
85  	/***
86  	 * Retrieve a feed over HTTP
87  	 * 
88  	 * @param feedUrl A non-null URL of a RSS/Atom feed to retrieve
89  	 * @return A {@link com.sun.syndication.feed.synd.SyndFeedI} object
90  	 * @throws IllegalArgumentException if the URL is null;
91  	 * @throws IOException if a TCP error occurs
92  	 * @throws FeedException if the feed is not valid
93  	 * @throws FetcherException if a HTTP error occurred
94  	 */
95  	public SyndFeedI retrieveFeed(URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
96  		if (feedUrl == null) {
97  			throw new IllegalArgumentException("null is not a valid URL");
98  		}						
99  
100 		URLConnection connection = feedUrl.openConnection();
101 		if (feedInfoCache != null) {
102 			SyndFeedInfo syndFeedInfo = feedInfoCache.getFeedInfo(feedUrl);			
103 			setRequestHeaders(connection, syndFeedInfo);
104 			connection.connect();
105 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
106 			if (connection instanceof HttpURLConnection) {
107 				HttpURLConnection httpConnection = (HttpURLConnection)connection;
108 				//httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be changed on a claswide basis				
109 				if (syndFeedInfo == null) {
110 					// this is a feed that hasn't been retrieved					
111 					syndFeedInfo = new SyndFeedInfo();	
112 					retrieveAndCacheFeed(feedUrl, syndFeedInfo, connection);								
113 				} else {
114 					// check the response code
115 					int responseCode = httpConnection.getResponseCode();									
116 					if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
117 						// the response code is not 304 NOT MODIFIED
118 						// This is either because the feed server
119 						// does not support condition gets
120 						// or because the feed hasn't changed 						
121 						retrieveAndCacheFeed(feedUrl, syndFeedInfo, connection);							
122 					} else {
123 						// the feed does not need retrieving
124 						fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);						
125 					}
126 				}
127 			} else {				
128 				fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection);				
129 			}				
130 			
131 			return syndFeedInfo.getSyndFeed();		
132 		} else {
133 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
134 			try {
135 				InputStream inputStream = feedUrl.openStream(); 
136 				return getSyndFeedFromStream(inputStream, connection);
137 			} catch (java.io.IOException e) {				
138 				handleErrorCodes(((HttpURLConnection)connection).getResponseCode());				
139 			}
140 			// we will never actually get to this line
141 			return null; 
142 		}
143 	}
144 
145 	/***
146 	 * <p>Handles HTTP error codes.</p>
147 	 * 
148 	 * @param responseCode the HTTP response code
149 	 * @throws FetcherException if response code is in the range 400 to 599 inclusive
150 	 */
151 	protected void handleErrorCodes(int responseCode) throws FetcherException {
152 		// Handle 2xx codes as OK, so ignore them here 
153 		// 3xx codes are handled by the HttpURLConnection class		
154 		if (responseCode >= 400 && responseCode < 500) {
155 			throw4XXError(responseCode);
156 		} else if (responseCode >= 500 && responseCode < 600) {
157 			throw new FetcherException(responseCode, "The server encounted an error");
158 		}		
159 	}
160 
161 	private void throw4XXError(int responseCode) throws FetcherException {		
162 		throw new FetcherException(responseCode, "The requested resource could not be found");
163 	}
164 
165 	protected void retrieveAndCacheFeed(URL feedUrl, SyndFeedInfo syndFeedInfo, URLConnection connection) throws IllegalArgumentException, FeedException, FetcherException, IOException {
166 		if (connection instanceof HttpURLConnection) {
167 			HttpURLConnection httpConnection = (HttpURLConnection)connection;
168 			handleErrorCodes(httpConnection.getResponseCode());
169 		}
170 		
171 		resetFeedInfo(feedUrl, syndFeedInfo, connection);			
172 		// resetting feed info in the cache
173 		// could be needed for some implementations 
174 		// of FeedFetcherCacheI (eg, distributed HashTables)
175 		if (feedInfoCache != null) {
176 			feedInfoCache.setFeedInfo(feedUrl, syndFeedInfo);			
177 		}		
178 	}
179 
180 	protected void resetFeedInfo(URL orignalUrl, SyndFeedInfo syndFeedInfo, URLConnection connection) throws IllegalArgumentException, IOException, FeedException {
181 		// need to always set the URL because this may have changed due to 3xx redirects 
182 		syndFeedInfo.setUrl(connection.getURL());	
183 			
184 		// the ID is a persistant value that should stay the same even if the URL for the 
185 		// feed changes (eg, by 3xx redirects)		
186 		syndFeedInfo.setId(orignalUrl.toString()); 
187 			
188 		// This will be 0 if the server doesn't support or isn't setting the last modified header
189 		syndFeedInfo.setLastModified(connection.getLastModified()); 
190 				
191 		// This will be null if the server doesn't support or isn't setting the ETag header
192 		syndFeedInfo.setETag(connection.getHeaderField("ETag"));
193 						
194 		// get the contents
195 		InputStream inputStream = null;
196 		try {
197 			inputStream = connection.getInputStream();			
198 			SyndFeedI syndFeed = getSyndFeedFromStream(inputStream, connection);		
199 			syndFeedInfo.setSyndFeed(syndFeed);			
200 		} finally {
201 			if (inputStream != null) {
202 				inputStream.close();
203 			}			
204 		}
205 	}
206 
207 	/***
208 	 * <p>Set appropriate HTTP headers, including conditional get and gzip encoding headers</p>
209 	 * 
210 	 * @param connection A URLConnection
211 	 * @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
212 	 */
213 	protected void setRequestHeaders(URLConnection connection, SyndFeedInfo syndFeedInfo) {				
214 		if (syndFeedInfo != null) {
215 			// set the headers to get feed only if modified
216 			// we support the use of both last modified and eTag headers
217 			if (syndFeedInfo.getLastModified() != 0) {
218 				connection.setIfModifiedSince(syndFeedInfo.getLastModified());
219 			}
220 			if (syndFeedInfo.getETag() != null) {
221 				connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
222 			}
223 						
224 		}
225 		// header to retrieve feed gzipped
226 		connection.setRequestProperty("Accept-Encoding", "gzip");
227 		
228 		// set the user agent
229 		connection.addRequestProperty("User-Agent", getUserAgent()); 
230 	}
231 
232 	private SyndFeedI getSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {						
233 		BufferedInputStream is;
234 		if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
235 			// handle gzip encoded content			
236 			is = new BufferedInputStream(new GZIPInputStream(inputStream));
237 		} else {
238 			is = new BufferedInputStream(inputStream);
239 		}
240 
241 		InputStreamReader reader = new InputStreamReader(is, ResponseHandler.getCharacterEncoding(connection));
242 		
243 		SyndFeedInput input = new SyndFeedInput();
244 		
245 		SyndFeedI feed = input.build(reader);		
246 		fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection);
247 		return feed;
248 	}
249 
250 	/***
251 	 * @return The FeedFetcherCacheI used by this fetcher (Could be null)
252 	 */
253 	public FeedFetcherCacheI getFeedInfoCache() {
254 		return feedInfoCache;
255 	}
256 
257 	/***
258 	 * @param cache The cache to be used by this fetcher (pass null to stop using a cache)
259 	 */
260 	public void setFeedInfoCache(FeedFetcherCacheI cache) {
261 		feedInfoCache = cache;
262 	}
263 }