enhancement for #181 - sets user-agent header on UrlConnection

2014-04-18 20:32:46 +02:00 · 2014-04-18 20:32:46 +02:00 · 95629798ba
commit 95629798ba
parent fee63dd5b1
2 changed files with 59 additions and 74 deletions
--- a/pom.xml
+++ b/pom.xml
@ -78,6 +78,19 @@
 	<build>
 		<plugins>
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-jar-plugin</artifactId>
 				<version>2.4</version>
 				<configuration>
 					<archive>
 						<manifest>
 							<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
 							<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
 						</manifest>
 					</archive>
 				</configuration>
 			</plugin>
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-site-plugin</artifactId>
--- a/src/main/java/com/sun/syndication/io/XmlReader.java
+++ b/src/main/java/com/sun/syndication/io/XmlReader.java
@ -34,25 +34,18 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 /**
- * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out the
+ * Character stream that handles (or at least attempts to) all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
 * charset encoding of the XML document within the stream.
 * <p>
- * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
+ * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
 * character stream.
 * <p>
- * All this has to be done without consuming characters from the stream, if not the XML parser will
+ * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100%
- * not recognized the document as a valid XML. This is not 100% true, but it's close enough (UTF-8
+ * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all parsers).
 * BOM is not handled by all parsers right now, XmlReader handles it and things work in all
 * parsers).
 * <p>
- * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP
+ * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
 * streams by offering a wide set of constructors.
 * <P>
- * By default the charset encoding detection is lenient, the constructor with the lenient flag can
+ * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
- * be used for an script (following HTTP MIME and XML specifications). All this is nicely explained
+ * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining
- * by Mark Pilgrim in his blog, <a
+ * the character encoding of a feed</a>.
 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character
 * encoding of a feed</a>.
 * <p>
 * 
 * @author Alejandro Abdelnur
@ -87,11 +80,9 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader for a File.
     * <p>
-     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
+     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * missing defaults to UTF-8.
     * <p>
-     * It does a lenient charset encoding detection, check the constructor with the lenient
+     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * parameter for details.
     * <p>
     * 
     * @param file File to create a Reader from.
@ -107,8 +98,7 @@ public class XmlReader extends Reader {
     * <p>
     * It follows the same logic used for files.
     * <p>
-     * It does a lenient charset encoding detection, check the constructor with the lenient
+     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * parameter for details.
     * <p>
     * 
     * @param is InputStream to create a Reader from.
@ -120,16 +110,13 @@ public class XmlReader extends Reader {
    }
    /**
-     * Creates a Reader for a raw InputStream and uses the provided default encoding if none is
+     * Creates a Reader for a raw InputStream and uses the provided default encoding if none is determined.
     * determined.
     * <p>
     * It follows the same logic used for files.
     * <p>
-     * If lenient detection is indicated and the detection above fails as per specifications it then
+     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * attempts the following:
     * <p>
-     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
+     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
@ -144,8 +131,7 @@ public class XmlReader extends Reader {
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @param defaultEncoding default encoding to use if one cannot be detected.
     * @throws IOException thrown if there is a problem reading the stream.
-     * @throws XmlReaderException thrown if the charset encoding could not be determined according
+     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *             to the specs.
     * 
     */
    public XmlReader(final InputStream is, final boolean lenient, final String defaultEncoding) throws IOException, XmlReaderException {
@ -170,11 +156,9 @@ public class XmlReader extends Reader {
     * <p>
     * It follows the same logic used for files.
     * <p>
-     * If lenient detection is indicated and the detection above fails as per specifications it then
+     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * attempts the following:
     * <p>
-     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
+     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
@ -188,8 +172,7 @@ public class XmlReader extends Reader {
     * @param is InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
-     * @throws XmlReaderException thrown if the charset encoding could not be determined according
+     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *             to the specs.
     * 
     */
    public XmlReader(final InputStream is, final boolean lenient) throws IOException, XmlReaderException {
@ -199,14 +182,11 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
-     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it
+     * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data it uses the same logic used for Files.
     * uses the same logic used for Files.
     * <p>
-     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the
+     * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
     * same logic used for an InputStream with content-type.
     * <p>
-     * It does a lenient charset encoding detection, check the constructor with the lenient
+     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * parameter for details.
     * <p>
     * 
     * @param url URL to create a Reader from.
@ -220,14 +200,12 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader using the InputStream of a URLConnection.
     * <p>
-     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header
+     * If the URLConnection is not of type HttpURLConnection and there is no 'content-type' header in the fetched data it uses the same logic used for files.
     * in the fetched data it uses the same logic used for files.
     * <p>
-     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it
+     * If the URLConnection is an HttpURLConnection or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
-     * uses the same logic used for an InputStream with content-type.
+     * content-type.
     * <p>
-     * It does a lenient charset encoding detection, check the constructor with the lenient
+     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * parameter for details.
     * <p>
     * 
     * @param conn URLConnection to create a Reader from.
@ -238,6 +216,12 @@ public class XmlReader extends Reader {
        defaultEncoding = staticDefaultEncoding;
        final boolean lenient = true;
        if (conn instanceof HttpURLConnection) {
            final Package pckg = this.getClass().getPackage();
            if (pckg.getImplementationTitle() != null && pckg.getImplementationVersion() != null) {
                conn.setRequestProperty("User-Agent", pckg.getImplementationTitle() + "/" + pckg.getImplementationVersion());
            } else {
                conn.setRequestProperty("User-Agent", "ROME");
            }
            try {
                doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
            } catch (final XmlReaderException ex) {
@ -261,12 +245,10 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
-     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
+     * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the XML prolog
-     * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
+     * encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
     * prolog encoding uses the default encoding mandated by the content-type MIME type.
     * <p>
-     * It does a lenient charset encoding detection, check the constructor with the lenient
+     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * parameter for details.
     * <p>
     * 
     * @param is InputStream to create the reader from.
@ -281,15 +263,12 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
-     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
+     * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the XML prolog
-     * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
+     * encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
     * prolog encoding uses the default encoding mandated by the content-type MIME type.
     * <p>
-     * If lenient detection is indicated and the detection above fails as per specifications it then
+     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * attempts the following:
     * <p>
-     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
+     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
@ -305,8 +284,7 @@ public class XmlReader extends Reader {
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @param defaultEncoding default encoding to use if one cannot be detected.
     * @throws IOException thrown if there is a problem reading the file.
-     * @throws XmlReaderException thrown if the charset encoding could not be determined according
+     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *             to the specs.
     * 
     */
    public XmlReader(final InputStream is, final String httpContentType, final boolean lenient, final String defaultEncoding) throws IOException,
@ -330,15 +308,12 @@ public class XmlReader extends Reader {
    /**
     * Creates a Reader using an InputStream and the associated content-type header.
     * <p>
-     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
+     * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no content-type encoding it checks the XML prolog
-     * If there is not content-type encoding checks the XML prolog encoding. If there is not XML
+     * encoding. If there is no XML prolog encoding it uses the default encoding mandated by the content-type MIME type.
     * prolog encoding uses the default encoding mandated by the content-type MIME type.
     * <p>
-     * If lenient detection is indicated and the detection above fails as per specifications it then
+     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * attempts the following:
     * <p>
-     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
+     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * again.
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * <p>
@ -353,8 +328,7 @@ public class XmlReader extends Reader {
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient indicates if the charset encoding detection should be relaxed.
     * @throws IOException thrown if there is a problem reading the file.
-     * @throws XmlReaderException thrown if the charset encoding could not be determined according
+     * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
     *             to the specs.
     * 
     */
    public XmlReader(final InputStream is, final String httpContentType, final boolean lenient) throws IOException, XmlReaderException {
@ -362,8 +336,7 @@ public class XmlReader extends Reader {
    }
    /**
-     * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the
+     * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
     * rules based on content-type are not adequate.
     * <p/>
     * If it is NULL the content-type based rules are used.
     * <p/>
@ -375,8 +348,7 @@ public class XmlReader extends Reader {
    }
    /**
-     * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the
+     * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
     * rules based on content-type are not adequate.
     * <p/>
     * If it is set to NULL the content-type based rules are used.
     * <p/>