diff --git a/pom.xml b/pom.xml index a09d306..b42af04 100644 --- a/pom.xml +++ b/pom.xml @@ -78,6 +78,19 @@ + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + + true + true + + + + org.apache.maven.plugins maven-site-plugin diff --git a/src/main/java/com/sun/syndication/io/XmlReader.java b/src/main/java/com/sun/syndication/io/XmlReader.java index 814bc82..cef999f 100644 --- a/src/main/java/com/sun/syndication/io/XmlReader.java +++ b/src/main/java/com/sun/syndication/io/XmlReader.java @@ -34,25 +34,18 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out the - * charset encoding of the XML document within the stream. + * Character stream that handles (or at least attempts to) all the necessary Voodoo to figure out the charset encoding of the XML document within the stream. *

- * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a - * character stream. + * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. *

- * All this has to be done without consuming characters from the stream, if not the XML parser will - * not recognized the document as a valid XML. This is not 100% true, but it's close enough (UTF-8 - * BOM is not handled by all parsers right now, XmlReader handles it and things work in all - * parsers). + * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100% + * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all parsers). *

- * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP - * streams by offering a wide set of constructors. + * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors. *

- * By default the charset encoding detection is lenient, the constructor with the lenient flag can - * be used for an script (following HTTP MIME and XML specifications). All this is nicely explained - * by Mark Pilgrim in his blog, Determining the character - * encoding of a feed. + * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for a strict detection (following HTTP MIME and XML + * specifications). All this is nicely explained by Mark Pilgrim in his blog, Determining + * the character encoding of a feed. *

* * @author Alejandro Abdelnur @@ -87,11 +80,9 @@ public class XmlReader extends Reader { /** * Creates a Reader for a File. *

- * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also - * missing defaults to UTF-8. + * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. *

- * It does a lenient charset encoding detection, check the constructor with the lenient - * parameter for details. + * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. *

* * @param file File to create a Reader from. @@ -107,8 +98,7 @@ public class XmlReader extends Reader { *

* It follows the same logic used for files. *

- * It does a lenient charset encoding detection, check the constructor with the lenient - * parameter for details. + * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. *

* * @param is InputStream to create a Reader from. @@ -120,16 +110,13 @@ public class XmlReader extends Reader { } /** - * Creates a Reader for a raw InputStream and uses the provided default encoding if none is - * determined. + * Creates a Reader for a raw InputStream and uses the provided default encoding if none is determined. *

* It follows the same logic used for files. *

- * If lenient detection is indicated and the detection above fails as per specifications it then - * attempts the following: + * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: *

- * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection - * again. + * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. *

* Else if the XML prolog had a charset encoding that encoding is used. *

@@ -144,8 +131,7 @@ public class XmlReader extends Reader { * @param lenient indicates if the charset encoding detection should be relaxed. * @param defaultEncoding default encoding to use if one cannot be detected. * @throws IOException thrown if there is a problem reading the stream. - * @throws XmlReaderException thrown if the charset encoding could not be determined according - * to the specs. + * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. * */ public XmlReader(final InputStream is, final boolean lenient, final String defaultEncoding) throws IOException, XmlReaderException { @@ -170,11 +156,9 @@ public class XmlReader extends Reader { *

* It follows the same logic used for files. *

- * If lenient detection is indicated and the detection above fails as per specifications it then - * attempts the following: + * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: *

- * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection - * again. + * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. *

* Else if the XML prolog had a charset encoding that encoding is used. *

@@ -188,8 +172,7 @@ public class XmlReader extends Reader { * @param is InputStream to create a Reader from. * @param lenient indicates if the charset encoding detection should be relaxed. * @throws IOException thrown if there is a problem reading the stream. - * @throws XmlReaderException thrown if the charset encoding could not be determined according - * to the specs. + * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. * */ public XmlReader(final InputStream is, final boolean lenient) throws IOException, XmlReaderException { @@ -199,14 +182,11 @@ public class XmlReader extends Reader { /** * Creates a Reader using the InputStream of a URL. *

- * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it - * uses the same logic used for Files. + * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files. *

- * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the - * same logic used for an InputStream with content-type. + * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type. *

- * It does a lenient charset encoding detection, check the constructor with the lenient - * parameter for details. + * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. *

* * @param url URL to create a Reader from. @@ -220,14 +200,12 @@ public class XmlReader extends Reader { /** * Creates a Reader using the InputStream of a URLConnection. *

- * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header - * in the fetched data it uses the same logic used for files. + * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files. *

- * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it - * uses the same logic used for an InputStream with content-type. + * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with + * content-type. *

- * It does a lenient charset encoding detection, check the constructor with the lenient - * parameter for details. + * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. *

* * @param conn URLConnection to create a Reader from. @@ -238,6 +216,12 @@ public class XmlReader extends Reader { defaultEncoding = staticDefaultEncoding; final boolean lenient = true; if (conn instanceof HttpURLConnection) { + final Package pckg = this.getClass().getPackage(); + if (pckg.getImplementationTitle() != null && pckg.getImplementationVersion() != null) { + conn.setRequestProperty("User-Agent", pckg.getImplementationTitle() + "/" + pckg.getImplementationVersion()); + } else { + conn.setRequestProperty("User-Agent", "ROME"); + } try { doHttpStream(conn.getInputStream(), conn.getContentType(), lenient); } catch (final XmlReaderException ex) { @@ -261,12 +245,10 @@ public class XmlReader extends Reader { /** * Creates a Reader using an InputStream and the associated content-type header. *

- * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. - * If there is not content-type encoding checks the XML prolog encoding. If there is not XML - * prolog encoding uses the default encoding mandated by the content-type MIME type. + * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog + * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. *

- * It does a lenient charset encoding detection, check the constructor with the lenient - * parameter for details. + * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. *

* * @param is InputStream to create the reader from. @@ -281,15 +263,12 @@ public class XmlReader extends Reader { /** * Creates a Reader using an InputStream and the associated content-type header. *

- * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. - * If there is not content-type encoding checks the XML prolog encoding. If there is not XML - * prolog encoding uses the default encoding mandated by the content-type MIME type. + * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog + * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. *

- * If lenient detection is indicated and the detection above fails as per specifications it then - * attempts the following: + * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: *

- * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection - * again. + * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. *

* Else if the XML prolog had a charset encoding that encoding is used. *

@@ -305,8 +284,7 @@ public class XmlReader extends Reader { * @param lenient indicates if the charset encoding detection should be relaxed. * @param defaultEncoding default encoding to use if one cannot be detected. * @throws IOException thrown if there is a problem reading the file. - * @throws XmlReaderException thrown if the charset encoding could not be determined according - * to the specs. + * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. * */ public XmlReader(final InputStream is, final String httpContentType, final boolean lenient, final String defaultEncoding) throws IOException, @@ -330,15 +308,12 @@ public class XmlReader extends Reader { /** * Creates a Reader using an InputStream and the associated content-type header. *

- * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. - * If there is not content-type encoding checks the XML prolog encoding. If there is not XML - * prolog encoding uses the default encoding mandated by the content-type MIME type. + * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog + * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. *

- * If lenient detection is indicated and the detection above fails as per specifications it then - * attempts the following: + * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: *

- * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection - * again. + * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. *

* Else if the XML prolog had a charset encoding that encoding is used. *

@@ -353,8 +328,7 @@ public class XmlReader extends Reader { * @param httpContentType content-type header to use for the resolution of the charset encoding. * @param lenient indicates if the charset encoding detection should be relaxed. * @throws IOException thrown if there is a problem reading the file. - * @throws XmlReaderException thrown if the charset encoding could not be determined according - * to the specs. + * @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs. * */ public XmlReader(final InputStream is, final String httpContentType, final boolean lenient) throws IOException, XmlReaderException { @@ -362,8 +336,7 @@ public class XmlReader extends Reader { } /** - * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the - * rules based on content-type are not adequate. + * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. *

* If it is NULL the content-type based rules are used. *

@@ -375,8 +348,7 @@ public class XmlReader extends Reader { } /** - * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the - * rules based on content-type are not adequate. + * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. *

* If it is set to NULL the content-type based rules are used. *