View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.fetcher.impl;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.net.HttpURLConnection;
24  import java.net.URL;
25  import java.net.URLConnection;
26  import java.util.zip.GZIPInputStream;
27  
28  import com.sun.syndication.feed.synd.SyndFeed;
29  import com.sun.syndication.fetcher.FetcherEvent;
30  import com.sun.syndication.fetcher.FetcherException;
31  import com.sun.syndication.io.FeedException;
32  import com.sun.syndication.io.SyndFeedInput;
33  
34  /***
35   * <p>Class to retrieve syndication files via HTTP.</p>
36   *
37   * <p>If passed a {@link com.sun.syndication.fetcher.impl.FeedFetcherCache} in the
38   * constructor it will use conditional gets to only retrieve modified content.</p>
39   *
40   * <p>The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where
41   * supported by the server.</p>
42   *
43   * <p>Simple usage:
44   * <pre>
45   * 	// create the cache
46   *	FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getFeedInfoCache();
47   *	// retrieve the feed the first time
48   *	// any subsequent request will use conditional gets and only
49   *	// retrieve the resource if it has changed
50   *	SyndFeed feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
51   *</pre>
52   *
53   * </p>
54   *
55   * @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
56   * @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
57   * @see <a href="http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html">http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html</a>
58   * @author Nick Lothian
59   */
60  public class HttpURLFeedFetcher extends AbstractFeedFetcher {
61  	static final int POLL_EVENT = 1;
62  	static final int RETRIEVE_EVENT = 2;
63  	static final int UNCHANGED_EVENT = 3;
64  
65  	private FeedFetcherCache feedInfoCache;
66  
67  
68  	/***
69  	 * Constructor to use HttpURLFeedFetcher without caching of feeds
70  	 *
71  	 */
72  	public HttpURLFeedFetcher() {
73  		super();
74  	}
75  
76  	/***
77  	 * Constructor to enable HttpURLFeedFetcher to cache feeds
78  	 *
79  	 * @param feedCache - an instance of the FeedFetcherCache interface
80  	 */
81  	public HttpURLFeedFetcher(FeedFetcherCache feedCache) {
82  		this();
83  		feedInfoCache = feedCache;
84  	}
85  
86  	/***
87  	 * Retrieve a feed over HTTP
88  	 *
89  	 * @param feedUrl A non-null URL of a RSS/Atom feed to retrieve
90  	 * @return A {@link com.sun.syndication.feed.synd.SyndFeed} object
91  	 * @throws IllegalArgumentException if the URL is null;
92  	 * @throws IOException if a TCP error occurs
93  	 * @throws FeedException if the feed is not valid
94  	 * @throws FetcherException if a HTTP error occurred
95  	 */
96  	public SyndFeed retrieveFeed(URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
97  		if (feedUrl == null) {
98  			throw new IllegalArgumentException("null is not a valid URL");
99  		}
100 		
101 		URLConnection connection = feedUrl.openConnection();
102 		if (!(connection instanceof HttpURLConnection)) {		    
103 		    throw new IllegalArgumentException(feedUrl.toExternalForm() + " is not a valid HTTP Url");
104 		}
105 		HttpURLConnection httpConnection = (HttpURLConnection)connection;
106 		// httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be changed on a claswide basis
107 		
108 		if (feedInfoCache != null) {
109 			SyndFeedInfo syndFeedInfo = feedInfoCache.getFeedInfo(feedUrl);
110 			setRequestHeaders(connection, syndFeedInfo);
111 			connection.connect();
112 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
113 							
114 			if (syndFeedInfo == null) {
115 				// this is a feed that hasn't been retrieved
116 				syndFeedInfo = new SyndFeedInfo();
117 				retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
118 			} else {
119 				// check the response code
120 				int responseCode = httpConnection.getResponseCode();
121 				if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
122 					// the response code is not 304 NOT MODIFIED
123 					// This is either because the feed server
124 					// does not support condition gets
125 					// or because the feed hasn't changed
126 					retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
127 				} else {
128 					// the feed does not need retrieving
129 					fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);
130 				}
131 			}
132 
133 			return syndFeedInfo.getSyndFeed();
134 		} else {
135 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
136 			try {
137 				InputStream inputStream = feedUrl.openStream();
138 				return getSyndFeedFromStream(inputStream, connection);
139 			} catch (java.io.IOException e) {
140 				handleErrorCodes(((HttpURLConnection)connection).getResponseCode());
141 			}
142 			// we will never actually get to this line
143 			return null;
144 		}
145 	}
146 
147 	protected void retrieveAndCacheFeed(URL feedUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, FeedException, FetcherException, IOException {
148 		handleErrorCodes(connection.getResponseCode());		
149 
150 		resetFeedInfo(feedUrl, syndFeedInfo, connection);
151 		// resetting feed info in the cache
152 		// could be needed for some implementations
153 		// of FeedFetcherCache (eg, distributed HashTables)
154 		if (feedInfoCache != null) {
155 			feedInfoCache.setFeedInfo(feedUrl, syndFeedInfo);
156 		}
157 	}
158 
159 	protected void resetFeedInfo(URL orignalUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, IOException, FeedException {
160 		// need to always set the URL because this may have changed due to 3xx redirects
161 		syndFeedInfo.setUrl(connection.getURL());
162 
163 		// the ID is a persistant value that should stay the same even if the URL for the
164 		// feed changes (eg, by 3xx redirects)
165 		syndFeedInfo.setId(orignalUrl.toString());
166 
167 		// This will be 0 if the server doesn't support or isn't setting the last modified header
168 		syndFeedInfo.setLastModified(new Long(connection.getLastModified()));
169 
170 		// This will be null if the server doesn't support or isn't setting the ETag header
171 		syndFeedInfo.setETag(connection.getHeaderField("ETag"));
172 
173 		// get the contents
174 		InputStream inputStream = null;
175 		try {
176 			inputStream = connection.getInputStream();
177 			SyndFeed syndFeed = getSyndFeedFromStream(inputStream, connection);
178 			
179 			String imHeader = connection.getHeaderField("IM");			
180 			if (isUsingDeltaEncoding() && (imHeader!= null && imHeader.indexOf("feed") >= 0) && (feedInfoCache != null) && connection.getResponseCode() == 226) {
181 			    // client is setup to use http delta encoding and the server supports it and has returned a delta encoded response
182 			    // This response only includes new items
183 			    SyndFeedInfo cachedInfo = feedInfoCache.getFeedInfo(orignalUrl);
184 			    if (cachedInfo != null) {
185 				    SyndFeed cachedFeed = cachedInfo.getSyndFeed();
186 				    
187 				    // set the new feed to be the orginal feed plus the new items
188 				    syndFeed = combineFeeds(cachedFeed, syndFeed);			        
189 			    }
190 			}
191 			
192 			syndFeedInfo.setSyndFeed(syndFeed);
193 		} finally {
194 			if (inputStream != null) {
195 				inputStream.close();
196 			}
197 		}
198 	}
199 
200 	/***
201 	 * <p>Set appropriate HTTP headers, including conditional get and gzip encoding headers</p>
202 	 *
203 	 * @param connection A URLConnection
204 	 * @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
205 	 */
206 	protected void setRequestHeaders(URLConnection connection, SyndFeedInfo syndFeedInfo) {
207 		if (syndFeedInfo != null) {
208 			// set the headers to get feed only if modified
209 			// we support the use of both last modified and eTag headers
210 			if (syndFeedInfo.getLastModified() != null) {			    
211 			    Object lastModified = syndFeedInfo.getLastModified();
212 			    if (lastModified instanceof Long) {
213 			        connection.setIfModifiedSince(((Long)syndFeedInfo.getLastModified()).longValue());
214 			    }				
215 			}
216 			if (syndFeedInfo.getETag() != null) {
217 				connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
218 			}
219 
220 		}
221 		// header to retrieve feed gzipped
222 		connection.setRequestProperty("Accept-Encoding", "gzip");
223 
224 		// set the user agent
225 		connection.addRequestProperty("User-Agent", getUserAgent());	
226 		
227 		if (isUsingDeltaEncoding()) {
228 		    connection.addRequestProperty("A-IM", "feed");
229 		}		
230 	}
231 
232 	private SyndFeed getSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {
233 		BufferedInputStream is;
234 		if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
235 			// handle gzip encoded content
236 			is = new BufferedInputStream(new GZIPInputStream(inputStream));
237 		} else {
238 			is = new BufferedInputStream(inputStream);
239 		}
240 
241 		InputStreamReader reader = new InputStreamReader(is, ResponseHandler.getCharacterEncoding(connection));
242 
243 		SyndFeedInput input = new SyndFeedInput();
244 
245 		SyndFeed feed = input.build(reader);
246 		fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection);
247 		return feed;
248 	}
249 
250 	/***
251 	 * @return The FeedFetcherCache used by this fetcher (Could be null)
252 	 */
253 	public FeedFetcherCache getFeedInfoCache() {
254 		return feedInfoCache;
255 	}
256 
257 	/***
258 	 * @param cache The cache to be used by this fetcher (pass null to stop using a cache)
259 	 */
260 	public void setFeedInfoCache(FeedFetcherCache cache) {
261 		feedInfoCache = cache;
262 	}
263 }