enhancement for #181 - sets user-agent header on UrlConnection
This commit is contained in:
parent
fee63dd5b1
commit
95629798ba
2 changed files with 59 additions and 74 deletions
13
pom.xml
13
pom.xml
|
@ -78,6 +78,19 @@
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<version>2.4</version>
|
||||||
|
<configuration>
|
||||||
|
<archive>
|
||||||
|
<manifest>
|
||||||
|
<addDefaultImplementationEntries>true</addDefaultImplementationEntries>
|
||||||
|
<addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
|
||||||
|
</manifest>
|
||||||
|
</archive>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
<artifactId>maven-site-plugin</artifactId>
|
<artifactId>maven-site-plugin</artifactId>
|
||||||
|
|
|
@ -34,25 +34,18 @@ import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Character stream that handles (or at least attempts to) all the necessary Voodoo to figure out the
|
* Character stream that handles (or at least attempts to) all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
|
||||||
* charset encoding of the XML document within the stream.
|
|
||||||
* <p>
|
* <p>
|
||||||
* IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a
|
* IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
|
||||||
* character stream.
|
|
||||||
* <p>
|
* <p>
|
||||||
* All this has to be done without consuming characters from the stream, if not the XML parser will
|
* All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100%
|
||||||
* not recognize the document as a valid XML. This is not 100% true, but it's close enough (UTF-8
|
* true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlReader handles it and things work in all parsers).
|
||||||
* BOM is not handled by all parsers right now, XmlReader handles it and things work in all
|
|
||||||
* parsers).
|
|
||||||
* <p>
|
* <p>
|
||||||
* The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP
|
* The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
|
||||||
* streams by offering a wide set of constructors.
|
|
||||||
* <P>
|
* <P>
|
||||||
* By default the charset encoding detection is lenient, the constructor with the lenient flag can
|
* By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script (following HTTP MIME and XML
|
||||||
* be used for an script (following HTTP MIME and XML specifications). All this is nicely explained
|
* specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining
|
||||||
* by Mark Pilgrim in his blog, <a
|
* the character encoding of a feed</a>.
|
||||||
* href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character
|
|
||||||
* encoding of a feed</a>.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @author Alejandro Abdelnur
|
* @author Alejandro Abdelnur
|
||||||
|
@ -87,11 +80,9 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader for a File.
|
* Creates a Reader for a File.
|
||||||
* <p>
|
* <p>
|
||||||
* It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also
|
* It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
|
||||||
* missing defaults to UTF-8.
|
|
||||||
* <p>
|
* <p>
|
||||||
* It does a lenient charset encoding detection, check the constructor with the lenient
|
* It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
|
||||||
* parameter for details.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @param file File to create a Reader from.
|
* @param file File to create a Reader from.
|
||||||
|
@ -107,8 +98,7 @@ public class XmlReader extends Reader {
|
||||||
* <p>
|
* <p>
|
||||||
* It follows the same logic used for files.
|
* It follows the same logic used for files.
|
||||||
* <p>
|
* <p>
|
||||||
* It does a lenient charset encoding detection, check the constructor with the lenient
|
* It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
|
||||||
* parameter for details.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @param is InputStream to create a Reader from.
|
* @param is InputStream to create a Reader from.
|
||||||
|
@ -120,16 +110,13 @@ public class XmlReader extends Reader {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a Reader for a raw InputStream and uses the provided default encoding if none is
|
* Creates a Reader for a raw InputStream and uses the provided default encoding if none is determined.
|
||||||
* determined.
|
|
||||||
* <p>
|
* <p>
|
||||||
* It follows the same logic used for files.
|
* It follows the same logic used for files.
|
||||||
* <p>
|
* <p>
|
||||||
* If lenient detection is indicated and the detection above fails as per specifications it then
|
* If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
|
||||||
* attempts the following:
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
|
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
|
||||||
* again.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Else if the XML prolog had a charset encoding that encoding is used.
|
* Else if the XML prolog had a charset encoding that encoding is used.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -144,8 +131,7 @@ public class XmlReader extends Reader {
|
||||||
* @param lenient indicates if the charset encoding detection should be relaxed.
|
* @param lenient indicates if the charset encoding detection should be relaxed.
|
||||||
* @param defaultEncoding default encoding to use if one cannot be detected.
|
* @param defaultEncoding default encoding to use if one cannot be detected.
|
||||||
* @throws IOException thrown if there is a problem reading the stream.
|
* @throws IOException thrown if there is a problem reading the stream.
|
||||||
* @throws XmlReaderException thrown if the charset encoding could not be determined according
|
* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
|
||||||
* to the specs.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public XmlReader(final InputStream is, final boolean lenient, final String defaultEncoding) throws IOException, XmlReaderException {
|
public XmlReader(final InputStream is, final boolean lenient, final String defaultEncoding) throws IOException, XmlReaderException {
|
||||||
|
@ -170,11 +156,9 @@ public class XmlReader extends Reader {
|
||||||
* <p>
|
* <p>
|
||||||
* It follows the same logic used for files.
|
* It follows the same logic used for files.
|
||||||
* <p>
|
* <p>
|
||||||
* If lenient detection is indicated and the detection above fails as per specifications it then
|
* If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
|
||||||
* attempts the following:
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
|
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
|
||||||
* again.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Else if the XML prolog had a charset encoding that encoding is used.
|
* Else if the XML prolog had a charset encoding that encoding is used.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -188,8 +172,7 @@ public class XmlReader extends Reader {
|
||||||
* @param is InputStream to create a Reader from.
|
* @param is InputStream to create a Reader from.
|
||||||
* @param lenient indicates if the charset encoding detection should be relaxed.
|
* @param lenient indicates if the charset encoding detection should be relaxed.
|
||||||
* @throws IOException thrown if there is a problem reading the stream.
|
* @throws IOException thrown if there is a problem reading the stream.
|
||||||
* @throws XmlReaderException thrown if the charset encoding could not be determined according
|
* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
|
||||||
* to the specs.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public XmlReader(final InputStream is, final boolean lenient) throws IOException, XmlReaderException {
|
public XmlReader(final InputStream is, final boolean lenient) throws IOException, XmlReaderException {
|
||||||
|
@ -199,14 +182,11 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader using the InputStream of a URL.
|
* Creates a Reader using the InputStream of a URL.
|
||||||
* <p>
|
* <p>
|
||||||
* If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it
|
* If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
|
||||||
* uses the same logic used for Files.
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the
|
* If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
|
||||||
* same logic used for an InputStream with content-type.
|
|
||||||
* <p>
|
* <p>
|
||||||
* It does a lenient charset encoding detection, check the constructor with the lenient
|
* It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
|
||||||
* parameter for details.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @param url URL to create a Reader from.
|
* @param url URL to create a Reader from.
|
||||||
|
@ -220,14 +200,12 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader using the InputStream of a URLConnection.
|
* Creates a Reader using the InputStream of a URLConnection.
|
||||||
* <p>
|
* <p>
|
||||||
* If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header
|
* If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
|
||||||
* in the fetched data it uses the same logic used for files.
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it
|
* If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
|
||||||
* uses the same logic used for an InputStream with content-type.
|
* content-type.
|
||||||
* <p>
|
* <p>
|
||||||
* It does a lenient charset encoding detection, check the constructor with the lenient
|
* It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
|
||||||
* parameter for details.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @param conn URLConnection to create a Reader from.
|
* @param conn URLConnection to create a Reader from.
|
||||||
|
@ -238,6 +216,12 @@ public class XmlReader extends Reader {
|
||||||
defaultEncoding = staticDefaultEncoding;
|
defaultEncoding = staticDefaultEncoding;
|
||||||
final boolean lenient = true;
|
final boolean lenient = true;
|
||||||
if (conn instanceof HttpURLConnection) {
|
if (conn instanceof HttpURLConnection) {
|
||||||
|
final Package pckg = this.getClass().getPackage();
|
||||||
|
if (pckg.getImplementationTitle() != null && pckg.getImplementationVersion() != null) {
|
||||||
|
conn.setRequestProperty("User-Agent", pckg.getImplementationTitle() + "/" + pckg.getImplementationVersion());
|
||||||
|
} else {
|
||||||
|
conn.setRequestProperty("User-Agent", "ROME");
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
|
doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
|
||||||
} catch (final XmlReaderException ex) {
|
} catch (final XmlReaderException ex) {
|
||||||
|
@ -261,12 +245,10 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader using an InputStream and the associated content-type header.
|
* Creates a Reader using an InputStream and the associated content-type header.
|
||||||
* <p>
|
* <p>
|
||||||
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
|
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
|
||||||
* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
|
* encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
|
||||||
* prolog encoding uses the default encoding mandated by the content-type MIME type.
|
|
||||||
* <p>
|
* <p>
|
||||||
* It does a lenient charset encoding detection, check the constructor with the lenient
|
* It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
|
||||||
* parameter for details.
|
|
||||||
* <p>
|
* <p>
|
||||||
*
|
*
|
||||||
* @param is InputStream to create the reader from.
|
* @param is InputStream to create the reader from.
|
||||||
|
@ -281,15 +263,12 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader using an InputStream and the associated content-type header.
|
* Creates a Reader using an InputStream and the associated content-type header.
|
||||||
* <p>
|
* <p>
|
||||||
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
|
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
|
||||||
* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
|
* encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
|
||||||
* prolog encoding uses the default encoding mandated by the content-type MIME type.
|
|
||||||
* <p>
|
* <p>
|
||||||
* If lenient detection is indicated and the detection above fails as per specifications it then
|
* If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
|
||||||
* attempts the following:
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
|
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
|
||||||
* again.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Else if the XML prolog had a charset encoding that encoding is used.
|
* Else if the XML prolog had a charset encoding that encoding is used.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -305,8 +284,7 @@ public class XmlReader extends Reader {
|
||||||
* @param lenient indicates if the charset encoding detection should be relaxed.
|
* @param lenient indicates if the charset encoding detection should be relaxed.
|
||||||
* @param defaultEncoding default encoding to use if one cannot be detected.
|
* @param defaultEncoding default encoding to use if one cannot be detected.
|
||||||
* @throws IOException thrown if there is a problem reading the file.
|
* @throws IOException thrown if there is a problem reading the file.
|
||||||
* @throws XmlReaderException thrown if the charset encoding could not be determined according
|
* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
|
||||||
* to the specs.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public XmlReader(final InputStream is, final String httpContentType, final boolean lenient, final String defaultEncoding) throws IOException,
|
public XmlReader(final InputStream is, final String httpContentType, final boolean lenient, final String defaultEncoding) throws IOException,
|
||||||
|
@ -330,15 +308,12 @@ public class XmlReader extends Reader {
|
||||||
/**
|
/**
|
||||||
* Creates a Reader using an InputStream and the associated content-type header.
|
* Creates a Reader using an InputStream and the associated content-type header.
|
||||||
* <p>
|
* <p>
|
||||||
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding.
|
* First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
|
||||||
* If there is not content-type encoding checks the XML prolog encoding. If there is not XML
|
* encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
|
||||||
* prolog encoding uses the default encoding mandated by the content-type MIME type.
|
|
||||||
* <p>
|
* <p>
|
||||||
* If lenient detection is indicated and the detection above fails as per specifications it then
|
* If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
|
||||||
* attempts the following:
|
|
||||||
* <p>
|
* <p>
|
||||||
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection
|
* If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
|
||||||
* again.
|
|
||||||
* <p>
|
* <p>
|
||||||
* Else if the XML prolog had a charset encoding that encoding is used.
|
* Else if the XML prolog had a charset encoding that encoding is used.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -353,8 +328,7 @@ public class XmlReader extends Reader {
|
||||||
* @param httpContentType content-type header to use for the resolution of the charset encoding.
|
* @param httpContentType content-type header to use for the resolution of the charset encoding.
|
||||||
* @param lenient indicates if the charset encoding detection should be relaxed.
|
* @param lenient indicates if the charset encoding detection should be relaxed.
|
||||||
* @throws IOException thrown if there is a problem reading the file.
|
* @throws IOException thrown if there is a problem reading the file.
|
||||||
* @throws XmlReaderException thrown if the charset encoding could not be determined according
|
* @throws XmlReaderException thrown if the charset encoding could not be determined according to the specs.
|
||||||
* to the specs.
|
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public XmlReader(final InputStream is, final String httpContentType, final boolean lenient) throws IOException, XmlReaderException {
|
public XmlReader(final InputStream is, final String httpContentType, final boolean lenient) throws IOException, XmlReaderException {
|
||||||
|
@ -362,8 +336,7 @@ public class XmlReader extends Reader {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the
|
* Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
|
||||||
* rules based on content-type are not adequate.
|
|
||||||
* <p/>
|
* <p/>
|
||||||
* If it is NULL the content-type based rules are used.
|
* If it is NULL the content-type based rules are used.
|
||||||
* <p/>
|
* <p/>
|
||||||
|
@ -375,8 +348,7 @@ public class XmlReader extends Reader {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the
|
* Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
|
||||||
* rules based on content-type are not adequate.
|
|
||||||
* <p/>
|
* <p/>
|
||||||
* If it is set to NULL the content-type based rules are used.
|
* If it is set to NULL the content-type based rules are used.
|
||||||
* <p/>
|
* <p/>
|
||||||
|
|
Loading…
Reference in a new issue