View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.fetcher.impl;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.HttpURLConnection;
23  import java.net.URL;
24  import java.net.URLConnection;
25  import java.util.zip.GZIPInputStream;
26  
27  import com.sun.syndication.feed.synd.SyndFeed;
28  import com.sun.syndication.fetcher.FetcherEvent;
29  import com.sun.syndication.fetcher.FetcherException;
30  import com.sun.syndication.io.FeedException;
31  import com.sun.syndication.io.SyndFeedInput;
32  import com.sun.syndication.io.XmlReader;
33  
34  /***
35   * <p>Class to retrieve syndication files via HTTP.</p>
36   *
37   * <p>If passed a {@link com.sun.syndication.fetcher.impl.FeedFetcherCache} in the
38   * constructor it will use conditional gets to only retrieve modified content.</p>
39   *
40   * <p>The class uses the Accept-Encoding: gzip header to retrieve gzipped feeds where
41   * supported by the server.</p>
42   *
43   * <p>Simple usage:
44   * <pre>
45   * 	// create the cache
46   *	FeedFetcherCache feedInfoCache = HashMapFeedInfoCache.getFeedInfoCache();
47   *	// retrieve the feed the first time
48   *	// any subsequent request will use conditional gets and only
49   *	// retrieve the resource if it has changed
50   *	SyndFeed feed = new HttpURLFeedFetcher(feedInfoCache).retrieveFeed(feedUrl);
51   *</pre>
52   *
53   * </p>
54   *
55   * @see <a href="http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers">http://fishbowl.pastiche.org/2002/10/21/http_conditional_get_for_rss_hackers</a>
56   * @see <a href="http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level">http://diveintomark.org/archives/2003/07/21/atom_aggregator_behavior_http_level</a>
57   * @see <a href="http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html">http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html</a>
58   * @author Nick Lothian
59   */
60  public class HttpURLFeedFetcher extends AbstractFeedFetcher {
61  	static final int POLL_EVENT = 1;
62  	static final int RETRIEVE_EVENT = 2;
63  	static final int UNCHANGED_EVENT = 3;
64  
65  	private FeedFetcherCache feedInfoCache;
66  
67  
68  	/***
69  	 * Constructor to use HttpURLFeedFetcher without caching of feeds
70  	 *
71  	 */
72  	public HttpURLFeedFetcher() {
73  		super();
74  	}
75  
76  	/***
77  	 * Constructor to enable HttpURLFeedFetcher to cache feeds
78  	 *
79  	 * @param feedCache - an instance of the FeedFetcherCache interface
80  	 */
81  	public HttpURLFeedFetcher(FeedFetcherCache feedCache) {
82  		this();
83  		feedInfoCache = feedCache;
84  	}
85  
86  	/***
87  	 * Retrieve a feed over HTTP
88  	 *
89  	 * @param feedUrl A non-null URL of a RSS/Atom feed to retrieve
90  	 * @return A {@link com.sun.syndication.feed.synd.SyndFeed} object
91  	 * @throws IllegalArgumentException if the URL is null;
92  	 * @throws IOException if a TCP error occurs
93  	 * @throws FeedException if the feed is not valid
94  	 * @throws FetcherException if a HTTP error occurred
95  	 */
96  	public SyndFeed retrieveFeed(URL feedUrl) throws IllegalArgumentException, IOException, FeedException, FetcherException {
97  		if (feedUrl == null) {
98  			throw new IllegalArgumentException("null is not a valid URL");
99  		}
100 		
101 		URLConnection connection = feedUrl.openConnection();
102 		if (!(connection instanceof HttpURLConnection)) {		    
103 		    throw new IllegalArgumentException(feedUrl.toExternalForm() + " is not a valid HTTP Url");
104 		}
105 		HttpURLConnection httpConnection = (HttpURLConnection)connection;		
106 		// httpConnection.setInstanceFollowRedirects(true); // this is true by default, but can be changed on a claswide basis		
107 		
108 		if (feedInfoCache != null) {
109 			SyndFeedInfo syndFeedInfo = feedInfoCache.getFeedInfo(feedUrl);
110 			setRequestHeaders(connection, syndFeedInfo);
111 			httpConnection.connect();
112 			try {
113 				fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
114 								
115 				if (syndFeedInfo == null) {
116 					// this is a feed that hasn't been retrieved
117 					syndFeedInfo = new SyndFeedInfo();
118 					retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
119 				} else {
120 					// check the response code
121 					int responseCode = httpConnection.getResponseCode();
122 					if (responseCode != HttpURLConnection.HTTP_NOT_MODIFIED) {
123 						// the response code is not 304 NOT MODIFIED
124 						// This is either because the feed server
125 						// does not support condition gets
126 						// or because the feed hasn't changed
127 						retrieveAndCacheFeed(feedUrl, syndFeedInfo, httpConnection);
128 					} else {
129 						// the feed does not need retrieving
130 						fireEvent(FetcherEvent.EVENT_TYPE_FEED_UNCHANGED, connection);
131 					}
132 				}
133 	
134 				return syndFeedInfo.getSyndFeed();
135 			} finally {
136 			    httpConnection.disconnect();
137 			}
138 		} else {
139 			fireEvent(FetcherEvent.EVENT_TYPE_FEED_POLLED, connection);
140 			InputStream inputStream = null;
141 			try {
142 				inputStream = feedUrl.openStream();						
143 				return getSyndFeedFromStream(inputStream, connection);
144 			} catch (java.io.IOException e) {
145 				handleErrorCodes(((HttpURLConnection)connection).getResponseCode());
146 			} finally {
147 			    if (inputStream != null) {
148 			        inputStream.close();
149 			    }
150 			}
151 			// we will never actually get to this line
152 			return null;
153 		}
154 	}
155 
156 	protected void retrieveAndCacheFeed(URL feedUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, FeedException, FetcherException, IOException {
157 		handleErrorCodes(connection.getResponseCode());		
158 
159 		resetFeedInfo(feedUrl, syndFeedInfo, connection);
160 		// resetting feed info in the cache
161 		// could be needed for some implementations
162 		// of FeedFetcherCache (eg, distributed HashTables)
163 		if (feedInfoCache != null) {
164 			feedInfoCache.setFeedInfo(feedUrl, syndFeedInfo);
165 		}
166 	}
167 
168 	protected void resetFeedInfo(URL orignalUrl, SyndFeedInfo syndFeedInfo, HttpURLConnection connection) throws IllegalArgumentException, IOException, FeedException {
169 		// need to always set the URL because this may have changed due to 3xx redirects
170 		syndFeedInfo.setUrl(connection.getURL());
171 
172 		// the ID is a persistant value that should stay the same even if the URL for the
173 		// feed changes (eg, by 3xx redirects)
174 		syndFeedInfo.setId(orignalUrl.toString());
175 
176 		// This will be 0 if the server doesn't support or isn't setting the last modified header
177 		syndFeedInfo.setLastModified(new Long(connection.getLastModified()));
178 
179 		// This will be null if the server doesn't support or isn't setting the ETag header
180 		syndFeedInfo.setETag(connection.getHeaderField("ETag"));
181 
182 		// get the contents
183 		InputStream inputStream = null;
184 		try {
185 			inputStream = connection.getInputStream();
186 			SyndFeed syndFeed = getSyndFeedFromStream(inputStream, connection);
187 			
188 			String imHeader = connection.getHeaderField("IM");			
189 			if (isUsingDeltaEncoding() && (imHeader!= null && imHeader.indexOf("feed") >= 0) && (feedInfoCache != null) && connection.getResponseCode() == 226) {
190 			    // client is setup to use http delta encoding and the server supports it and has returned a delta encoded response
191 			    // This response only includes new items
192 			    SyndFeedInfo cachedInfo = feedInfoCache.getFeedInfo(orignalUrl);
193 			    if (cachedInfo != null) {
194 				    SyndFeed cachedFeed = cachedInfo.getSyndFeed();
195 				    
196 				    // set the new feed to be the orginal feed plus the new items
197 				    syndFeed = combineFeeds(cachedFeed, syndFeed);			        
198 			    }
199 			}
200 			
201 			syndFeedInfo.setSyndFeed(syndFeed);
202 		} finally {
203 			if (inputStream != null) {
204 				inputStream.close();
205 			}
206 		}
207 	}
208 
209 	/***
210 	 * <p>Set appropriate HTTP headers, including conditional get and gzip encoding headers</p>
211 	 *
212 	 * @param connection A URLConnection
213 	 * @param syndFeedInfo The SyndFeedInfo for the feed to be retrieved. May be null
214 	 */
215 	protected void setRequestHeaders(URLConnection connection, SyndFeedInfo syndFeedInfo) {
216 		if (syndFeedInfo != null) {
217 			// set the headers to get feed only if modified
218 			// we support the use of both last modified and eTag headers
219 			if (syndFeedInfo.getLastModified() != null) {			    
220 			    Object lastModified = syndFeedInfo.getLastModified();
221 			    if (lastModified instanceof Long) {
222 			        connection.setIfModifiedSince(((Long)syndFeedInfo.getLastModified()).longValue());
223 			    }				
224 			}
225 			if (syndFeedInfo.getETag() != null) {
226 				connection.setRequestProperty("If-None-Match", syndFeedInfo.getETag());
227 			}
228 
229 		}
230 		// header to retrieve feed gzipped
231 		connection.setRequestProperty("Accept-Encoding", "gzip");
232 
233 		// set the user agent
234 		connection.addRequestProperty("User-Agent", getUserAgent());	
235 		
236 		if (isUsingDeltaEncoding()) {
237 		    connection.addRequestProperty("A-IM", "feed");
238 		}		
239 	}
240 
241 	private SyndFeed getSyndFeedFromStream(InputStream inputStream, URLConnection connection) throws IOException, IllegalArgumentException, FeedException {
242 		BufferedInputStream is;
243 		if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
244 			// handle gzip encoded content
245 			is = new BufferedInputStream(new GZIPInputStream(inputStream));
246 		} else {
247 			is = new BufferedInputStream(inputStream);
248 		}
249 
250 		//InputStreamReader reader = new InputStreamReader(is, ResponseHandler.getCharacterEncoding(connection));
251 
252 		//SyndFeedInput input = new SyndFeedInput();
253 
254 	    XmlReader reader = null;	    
255 	    if (connection.getHeaderField("Content-Type") != null) {
256 	        reader = new XmlReader(is, connection.getHeaderField("Content-Type"), true);
257 	    } else {
258 	        reader = new XmlReader(is, true);
259 	    }
260 	    
261 		SyndFeed feed = new SyndFeedInput().build(reader);		
262 		fireEvent(FetcherEvent.EVENT_TYPE_FEED_RETRIEVED, connection, feed);
263 		return feed;
264 	}
265 
266 	/***
267 	 * @return The FeedFetcherCache used by this fetcher (Could be null)
268 	 */
269 	public FeedFetcherCache getFeedInfoCache() {
270 		return feedInfoCache;
271 	}
272 
273 	/***
274 	 * @param cache The cache to be used by this fetcher (pass null to stop using a cache)
275 	 */
276 	public void setFeedInfoCache(FeedFetcherCache cache) {
277 		feedInfoCache = cache;
278 	}
279 }