View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.fetcher.impl;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.HttpURLConnection;
23  import java.net.URL;
24  import java.net.URLConnection;
25  import java.util.zip.GZIPInputStream;
26  
27  import com.sun.syndication.feed.synd.SyndFeed;
28  import com.sun.syndication.fetcher.FetcherEvent;
29  import com.sun.syndication.fetcher.FetcherException;
30  import com.sun.syndication.io.FeedException;
31  import com.sun.syndication.io.SyndFeedInput;
32  import com.sun.syndication.io.XmlReader;
33  
34  /***
35   * <p>Class to retrieve syndication files via HTTP.</p>
36   *
37   * <p>If passed a {@link com.sun.syndication.fetcher.impl.FeedFetcherCache} in the
38   * constructor it will use conditional gets to only retrieve modified content.</p>
39   *
40   * <p>The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where
41   * supported by the server.</p>
42   *
43   * <p>Simple usage:
44   * <pre>
45   * 	// create the cache
46   *	FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getFeedInfoCache();
47   *	// retrieve the feed the first time
48   *	// any subsequent request will use conditional gets and only
49   *	// retrieve the resource if it has changed
50   *	SyndFeed feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
51   *</pre>
52   *
53   * </p>
54   *
55   * @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
56   * @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
57   * @see <a href="http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html">http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html</a>
58   * @author Nick Lothian
59   */
60  public class HttpURLFeedFetcher extends AbstractFeedFetcher {
61  	static final int POLL_EVENT = 1;
62  	static final int RETRIEVE_EVENT = 2;
63  	static final int UNCHANGED_EVENT = 3;
64  
65  	private FeedFetcherCache feedInfoCache;
66  
67  
68  	/***
69  	 * Constructor to use HttpURLFeedFetcher without caching of feeds
70  	 *
71  	 */
72  	public HttpURLFeedFetcher() {
73  		super();
74  	}
75  
76  	/***
77  	 * Constructor to enable HttpURLFeedFetcher to cache feeds
78  	 *
79  	 * @param feedCache - an instance of the FeedFetcherCache interface
80  	 */
81  	public HttpURLFeedFetcher(FeedFetcherCache feedInfoCache) {
82  		this();
83  		setFeedInfoCache(feedInfoCache);
84  	}
85  
86  	/***
87  	 * Retrieve a feed over HTTP
88  	 *
89  	 * @param feedUrl A non-null URL of a RSS/Atom feed to retrieve
90  	 * @return A {@link com.sun.syndication.feed.synd.SyndFeed} object
91  	 * @throws IllegalArgumentException if the URL is null;
92  	 * @throws IOException if a TCP error occurs
93  	 * @throws FeedException if the feed is not valid
94  	 * @throws FetcherException if a HTTP error occurred
95  	 */
96  	public SyndFeed retrieveFeed(URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
97  		if (feedUrl == null) {
98  			throw new IllegalArgumentException("null is not a valid URL");
99  		}
100 		
101 		URLConnection connection = feedUrl.openConnection();
102 		if (!(connection instanceof HttpURLConnection)) {		    
103 		    throw new IllegalArgumentException(feedUrl.toExternalForm() + " is not a valid HTTP Url");
104 		}
105 		HttpURLConnection httpConnection = (HttpURLConnection)connection;		
106 		// httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be changed on a claswide basis		
107 		
108 		FeedFetcherCache cache = getFeedInfoCache();
109 		if (cache != null) {
110 			SyndFeedInfo syndFeedInfo = cache.getFeedInfo(feedUrl);
111 			setRequestHeaders(connection, syndFeedInfo);
112 			httpConnection.connect();
113 			try {
114 				fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
115 								
116 				if (syndFeedInfo == null) {
117 					// this is a feed that hasn't been retrieved
118 					syndFeedInfo = new SyndFeedInfo();
119 					retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
120 				} else {
121 					// check the response code
122 					int responseCode = httpConnection.getResponseCode();
123 					if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
124 						// the response code is not 304 NOT MODIFIED
125 						// This is either because the feed server
126 						// does not support condition gets
127 						// or because the feed hasn't changed
128 						retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
129 					} else {
130 						// the feed does not need retrieving
131 						fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);
132 					}
133 				}
134 	
135 				return syndFeedInfo.getSyndFeed();
136 			} finally {
137 			    httpConnection.disconnect();
138 			}
139 		} else {			
140 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
141 			InputStream inputStream = null;
142 			setRequestHeaders(connection, null);
143 			httpConnection.connect();
144 			try {
145 				inputStream = httpConnection.getInputStream();						
146 				return getSyndFeedFromStream(inputStream, connection);
147 			} catch (java.io.IOException e) {
148 				handleErrorCodes(((HttpURLConnection)connection).getResponseCode());
149 			} finally {
150 			    if (inputStream != null) {
151 			        inputStream.close();
152 			    }
153 			    httpConnection.disconnect();
154 			}
155 			// we will never actually get to this line
156 			return null;
157 		}
158 	}
159 
160 	protected void retrieveAndCacheFeed(URL feedUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, FeedException, FetcherException, IOException {
161 		handleErrorCodes(connection.getResponseCode());		
162 
163 		resetFeedInfo(feedUrl, syndFeedInfo, connection);
164 		FeedFetcherCache cache = getFeedInfoCache();
165 		// resetting feed info in the cache
166 		// could be needed for some implementations
167 		// of FeedFetcherCache (eg, distributed HashTables)
168 		if (cache != null) {
169 			cache.setFeedInfo(feedUrl, syndFeedInfo);
170 		}
171 	}
172 
173 	protected void resetFeedInfo(URL orignalUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, IOException, FeedException {
174 		// need to always set the URL because this may have changed due to 3xx redirects
175 		syndFeedInfo.setUrl(connection.getURL());
176 
177 		// the ID is a persistant value that should stay the same even if the URL for the
178 		// feed changes (eg, by 3xx redirects)
179 		syndFeedInfo.setId(orignalUrl.toString());
180 
181 		// This will be 0 if the server doesn't support or isn't setting the last modified header
182 		syndFeedInfo.setLastModified(new Long(connection.getLastModified()));
183 
184 		// This will be null if the server doesn't support or isn't setting the ETag header
185 		syndFeedInfo.setETag(connection.getHeaderField("ETag"));
186 
187 		// get the contents
188 		InputStream inputStream = null;
189 		try {
190 			inputStream = connection.getInputStream();
191 			SyndFeed syndFeed = getSyndFeedFromStream(inputStream, connection);
192 			
193 			String imHeader = connection.getHeaderField("IM");			
194 			if (isUsingDeltaEncoding() && (imHeader!= null && imHeader.indexOf("feed") >= 0)) {
195 				FeedFetcherCache cache = getFeedInfoCache();
196 				if (cache != null && connection.getResponseCode() == 226) {
197 				    // client is setup to use http delta encoding and the server supports it and has returned a delta encoded response
198 				    // This response only includes new items
199 				    SyndFeedInfo cachedInfo = cache.getFeedInfo(orignalUrl);
200 				    if (cachedInfo != null) {
201 					    SyndFeed cachedFeed = cachedInfo.getSyndFeed();
202 					    
203 					    // set the new feed to be the orginal feed plus the new items
204 					    syndFeed = combineFeeds(cachedFeed, syndFeed);			        
205 				    }
206 				}
207 			}
208 			
209 			syndFeedInfo.setSyndFeed(syndFeed);
210 		} finally {
211 			if (inputStream != null) {
212 				inputStream.close();
213 			}
214 		}
215 	}
216 
217 	/***
218 	 * <p>Set appropriate HTTP headers, including conditional get and gzip encoding headers</p>
219 	 *
220 	 * @param connection A URLConnection
221 	 * @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
222 	 */
223 	protected void setRequestHeaders(URLConnection connection, SyndFeedInfo syndFeedInfo) {
224 		if (syndFeedInfo != null) {
225 			// set the headers to get feed only if modified
226 			// we support the use of both last modified and eTag headers
227 			if (syndFeedInfo.getLastModified() != null) {			    
228 			    Object lastModified = syndFeedInfo.getLastModified();
229 			    if (lastModified instanceof Long) {
230 			        connection.setIfModifiedSince(((Long)syndFeedInfo.getLastModified()).longValue());
231 			    }				
232 			}
233 			if (syndFeedInfo.getETag() != null) {
234 				connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
235 			}
236 
237 		}
238 		// header to retrieve feed gzipped
239 		connection.setRequestProperty("Accept-Encoding", "gzip");
240 
241 		// set the user agent
242 		connection.addRequestProperty("User-Agent", getUserAgent());	
243 		
244 		if (isUsingDeltaEncoding()) {
245 		    connection.addRequestProperty("A-IM", "feed");
246 		}		
247 	}
248 
249 	private static SyndFeed readSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {
250 		BufferedInputStream is;
251 		if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
252 			// handle gzip encoded content
253 			is = new BufferedInputStream(new GZIPInputStream(inputStream));
254 		} else {
255 			is = new BufferedInputStream(inputStream);
256 		}
257 
258 		//InputStreamReader reader = new InputStreamReader(is, ResponseHandler.getCharacterEncoding(connection));
259 
260 		//SyndFeedInput input = new SyndFeedInput();
261 
262 	    XmlReader reader = null;	    
263 	    if (connection.getHeaderField("Content-Type") != null) {
264 	        reader = new XmlReader(is, connection.getHeaderField("Content-Type"), true);
265 	    } else {
266 	        reader = new XmlReader(is, true);
267 	    }
268 	    
269 		return new SyndFeedInput().build(reader);
270 	}
271 
272 	private SyndFeed getSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {
273 		SyndFeed feed = readSyndFeedFromStream(inputStream, connection);
274 		fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection, feed);
275 		return feed;
276 	}
277 
278 	/***
279 	 * @return The FeedFetcherCache used by this fetcher (Could be null)
280 	 */
281 	public synchronized FeedFetcherCache getFeedInfoCache() {
282 		return feedInfoCache;
283 	}
284 
285 	/***
286 	 * @param cache The cache to be used by this fetcher (pass null to stop using a cache)
287 	 */
288 	public synchronized void setFeedInfoCache(FeedFetcherCache cache) {
289 		feedInfoCache = cache;
290 	}
291 }