View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.fetcher.impl;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.net.HttpURLConnection;
24  import java.net.URL;
25  import java.net.URLConnection;
26  import java.util.zip.GZIPInputStream;
27  
28  import com.sun.syndication.feed.synd.SyndFeed;
29  import com.sun.syndication.fetcher.FetcherEvent;
30  import com.sun.syndication.fetcher.FetcherException;
31  import com.sun.syndication.io.FeedException;
32  import com.sun.syndication.io.SyndFeedInput;
33  
34  /***
35   * <p>Class to retrieve syndication files via HTTP.</p>
36   *
37   * <p>If passed a {@link com.sun.syndication.fetcher.impl.FeedFetcherCache} in the
38   * constructor it will use conditional gets to only retrieve modified content.</p>
39   *
40   * <p>The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where
41   * supported by the server.</p>
42   *
43   * <p>Simple usage:
44   * <pre>
45   * 	// create the cache
46   *	FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getFeedInfoCache();
47   *	// retrieve the feed the first time
48   *	// any subsequent request will use conditional gets and only
49   *	// retrieve the resource if it has changed
50   *	SyndFeed feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
51   *</pre>
52   *
53   * </p>
54   *
55   * @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
56   * @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
57   * @author Nick Lothian
58   */
59  public class HttpURLFeedFetcher extends AbstractFeedFetcher {
60  	static final int POLL_EVENT = 1;
61  	static final int RETRIEVE_EVENT = 2;
62  	static final int UNCHANGED_EVENT = 3;
63  
64  	private FeedFetcherCache feedInfoCache;
65  
66  
67  	/***
68  	 * Constructor to use HttpURLFeedFetcher without caching of feeds
69  	 *
70  	 */
71  	public HttpURLFeedFetcher() {
72  		super();
73  	}
74  
75  	/***
76  	 * Constructor to enable HttpURLFeedFetcher to cache feeds
77  	 *
78  	 * @param feedCache - an instance of the FeedFetcherCache interface
79  	 */
80  	public HttpURLFeedFetcher(FeedFetcherCache feedCache) {
81  		this();
82  		feedInfoCache = feedCache;
83  	}
84  
85  	/***
86  	 * Retrieve a feed over HTTP
87  	 *
88  	 * @param feedUrl A non-null URL of a RSS/Atom feed to retrieve
89  	 * @return A {@link com.sun.syndication.feed.synd.SyndFeed} object
90  	 * @throws IllegalArgumentException if the URL is null;
91  	 * @throws IOException if a TCP error occurs
92  	 * @throws FeedException if the feed is not valid
93  	 * @throws FetcherException if a HTTP error occurred
94  	 */
95  	public SyndFeed retrieveFeed(URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
96  		if (feedUrl == null) {
97  			throw new IllegalArgumentException("null is not a valid URL");
98  		}
99  
100 		URLConnection connection = feedUrl.openConnection();
101 		if (feedInfoCache != null) {
102 			SyndFeedInfo syndFeedInfo = feedInfoCache.getFeedInfo(feedUrl);
103 			setRequestHeaders(connection, syndFeedInfo);
104 			connection.connect();
105 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
106 			if (connection instanceof HttpURLConnection) {
107 				HttpURLConnection httpConnection = (HttpURLConnection)connection;
108 				//httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be changed on a claswide basis
109 				if (syndFeedInfo == null) {
110 					// this is a feed that hasn't been retrieved
111 					syndFeedInfo = new SyndFeedInfo();
112 					retrieveAndCacheFeed(feedUrl, syndFeedInfo, connection);
113 				} else {
114 					// check the response code
115 					int responseCode = httpConnection.getResponseCode();
116 					if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
117 						// the response code is not 304 NOT MODIFIED
118 						// This is either because the feed server
119 						// does not support condition gets
120 						// or because the feed hasn't changed
121 						retrieveAndCacheFeed(feedUrl, syndFeedInfo, connection);
122 					} else {
123 						// the feed does not need retrieving
124 						fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);
125 					}
126 				}
127 			} else {
128 				fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection);
129 			}
130 
131 			return syndFeedInfo.getSyndFeed();
132 		} else {
133 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
134 			try {
135 				InputStream inputStream = feedUrl.openStream();
136 				return getSyndFeedFromStream(inputStream, connection);
137 			} catch (java.io.IOException e) {
138 				handleErrorCodes(((HttpURLConnection)connection).getResponseCode());
139 			}
140 			// we will never actually get to this line
141 			return null;
142 		}
143 	}
144 
145 	protected void retrieveAndCacheFeed(URL feedUrl, SyndFeedInfo syndFeedInfo, URLConnection connection) throws IllegalArgumentException, FeedException, FetcherException, IOException {
146 		if (connection instanceof HttpURLConnection) {
147 			HttpURLConnection httpConnection = (HttpURLConnection)connection;
148 			handleErrorCodes(httpConnection.getResponseCode());
149 		}
150 
151 		resetFeedInfo(feedUrl, syndFeedInfo, connection);
152 		// resetting feed info in the cache
153 		// could be needed for some implementations
154 		// of FeedFetcherCache (eg, distributed HashTables)
155 		if (feedInfoCache != null) {
156 			feedInfoCache.setFeedInfo(feedUrl, syndFeedInfo);
157 		}
158 	}
159 
160 	protected void resetFeedInfo(URL orignalUrl, SyndFeedInfo syndFeedInfo, URLConnection connection) throws IllegalArgumentException, IOException, FeedException {
161 		// need to always set the URL because this may have changed due to 3xx redirects
162 		syndFeedInfo.setUrl(connection.getURL());
163 
164 		// the ID is a persistant value that should stay the same even if the URL for the
165 		// feed changes (eg, by 3xx redirects)
166 		syndFeedInfo.setId(orignalUrl.toString());
167 
168 		// This will be 0 if the server doesn't support or isn't setting the last modified header
169 		syndFeedInfo.setLastModified(new Long(connection.getLastModified()));
170 
171 		// This will be null if the server doesn't support or isn't setting the ETag header
172 		syndFeedInfo.setETag(connection.getHeaderField("ETag"));
173 
174 		// get the contents
175 		InputStream inputStream = null;
176 		try {
177 			inputStream = connection.getInputStream();
178 			SyndFeed syndFeed = getSyndFeedFromStream(inputStream, connection);
179 			syndFeedInfo.setSyndFeed(syndFeed);
180 		} finally {
181 			if (inputStream != null) {
182 				inputStream.close();
183 			}
184 		}
185 	}
186 
187 	/***
188 	 * <p>Set appropriate HTTP headers, including conditional get and gzip encoding headers</p>
189 	 *
190 	 * @param connection A URLConnection
191 	 * @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
192 	 */
193 	protected void setRequestHeaders(URLConnection connection, SyndFeedInfo syndFeedInfo) {
194 		if (syndFeedInfo != null) {
195 			// set the headers to get feed only if modified
196 			// we support the use of both last modified and eTag headers
197 			if (syndFeedInfo.getLastModified() != null) {			    
198 			    Object lastModified = syndFeedInfo.getLastModified();
199 			    if (lastModified instanceof Long) {
200 			        connection.setIfModifiedSince(((Long)syndFeedInfo.getLastModified()).longValue());
201 			    }				
202 			}
203 			if (syndFeedInfo.getETag() != null) {
204 				connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
205 			}
206 
207 		}
208 		// header to retrieve feed gzipped
209 		connection.setRequestProperty("Accept-Encoding", "gzip");
210 
211 		// set the user agent
212 		connection.addRequestProperty("User-Agent", getUserAgent());
213 	}
214 
215 	private SyndFeed getSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {
216 		BufferedInputStream is;
217 		if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
218 			// handle gzip encoded content
219 			is = new BufferedInputStream(new GZIPInputStream(inputStream));
220 		} else {
221 			is = new BufferedInputStream(inputStream);
222 		}
223 
224 		InputStreamReader reader = new InputStreamReader(is, ResponseHandler.getCharacterEncoding(connection));
225 
226 		SyndFeedInput input = new SyndFeedInput();
227 
228 		SyndFeed feed = input.build(reader);
229 		fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection);
230 		return feed;
231 	}
232 
233 	/***
234 	 * @return The FeedFetcherCache used by this fetcher (Could be null)
235 	 */
236 	public FeedFetcherCache getFeedInfoCache() {
237 		return feedInfoCache;
238 	}
239 
240 	/***
241 	 * @param cache The cache to be used by this fetcher (pass null to stop using a cache)
242 	 */
243 	public void setFeedInfoCache(FeedFetcherCache cache) {
244 		feedInfoCache = cache;
245 	}
246 }