View Javadoc

1   /*
2    * Copyright 2005 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io.impl;
18  
19  import java.io.IOException;
20  import java.io.Reader;
21  import java.util.HashMap;
22  import java.util.Map;
23  import java.util.regex.Pattern;
24  import java.util.regex.Matcher;
25  
/**
 * A {@link Reader} decorator that cleans up a character stream before it is
 * handed to an XML parser.
 * <p>
 * Two repairs are applied while reading:
 * <ul>
 *   <li>On the first read, leading whitespace and leading
 *       <code>&lt;!-- ... --&gt;</code> comments are discarded, so the first
 *       character returned is the first real content of the stream.</li>
 *   <li>Named HTML 4 character entities (for example <code>&amp;nbsp;</code>)
 *       are replaced by their numeric form (<code>&amp;#160;</code>), and a
 *       bare <code>&amp;</code> that does not start an entity is rewritten as
 *       <code>&amp;amp;</code>.</li>
 * </ul>
 * Instances keep parsing state in fields and do no synchronization of their own.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    // States of the entity-fixing machine driven by read().
    /** Characters are passed through directly from the underlying reader. */
    private static final int STATE_STREAM = 0;
    /** An '&amp;' has been seen; the candidate entity is being accumulated. */
    private static final int STATE_ENTITY = 1;
    /** A complete "&amp;...;" token is in the buffer and must be looked up. */
    private static final int STATE_REPLACE = 2;
    /** The buffer content is being handed out one character at a time. */
    private static final int STATE_BUFFER = 3;

    // States of the leading-trim machine used by trimStream().
    /** Skipping whitespace, waiting for the first significant character. */
    private static final int TRIM_START = 0;
    /** Seen "&lt;". */
    private static final int TRIM_LT = 1;
    /** Seen "&lt;!". */
    private static final int TRIM_BANG = 2;
    /** Seen "&lt;!-". */
    private static final int TRIM_BANG_DASH = 3;
    /** Inside a comment body. */
    private static final int TRIM_COMMENT = 4;
    /** Inside a comment, the last character was a single '-'. */
    private static final int TRIM_COMMENT_DASH = 5;
    /** Inside a comment, the last two characters were "--". */
    private static final int TRIM_COMMENT_DASH_DASH = 6;

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in the reader to clean up; it is also used as the lock object of
     *           this reader.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        _buffer = new StringBuffer();
        _state = STATE_STREAM;
    }

    /** Whether the leading whitespace/comments have already been stripped. */
    private boolean trimmed;
    /** Holds look-ahead characters that still have to be returned by read(). */
    private StringBuffer _buffer;
    /** Index of the next character of {@link #_buffer} to return. */
    private int _bufferPos;
    /** Current state of the entity-fixing machine (one of the STATE_* values). */
    private int _state = STATE_STREAM;
    /**
     * Set when a bare '&amp;' run was terminated by another '&amp;'; that second
     * ampersand must start a new entity scan once the buffer is drained.
     */
    private boolean _pendingAmp;

    /**
     * Marks the look-ahead buffer as ready to be returned by read().
     *
     * @return always <code>true</code>, so callers can write
     *         <code>return emitBuffer();</code>.
     */
    private boolean emitBuffer() {
        _state = STATE_BUFFER;
        return true;
    }

    /**
     * Discards leading whitespace and leading comments from the underlying
     * reader.
     * <p>
     * Characters that were read ahead but turn out not to belong to a comment
     * (for example "&lt;a" or "&lt;!D") are left in the buffer and the main state
     * machine is switched to buffer-consuming mode so read() returns them first.
     * If the stream ends inside an unfinished comment, the partial comment text
     * is returned as content rather than dropped.
     *
     * @return <code>false</code> if the stream contains nothing but whitespace
     *         and comments, <code>true</code> otherwise.
     * @throws IOException if the underlying reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_START;
        while (true) {
            int c = in.read();
            if (c == -1) {
                // Nothing buffered in TRIM_START; in every other state the
                // partially matched text is handed back as content.
                if (state == TRIM_START) {
                    return false;
                }
                return emitBuffer();
            }
            switch (state) {
                case TRIM_START:
                    // '\r' and '\t' are skipped too so CRLF files and
                    // tab-indented prologs are trimmed like plain spaces.
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    if (c != '<') {
                        return emitBuffer();
                    }
                    state = TRIM_LT;
                    break;
                case TRIM_LT:
                    _buffer.append((char) c);
                    if (c != '!') {
                        return emitBuffer();
                    }
                    state = TRIM_BANG;
                    break;
                case TRIM_BANG:
                    _buffer.append((char) c);
                    if (c != '-') {
                        return emitBuffer();
                    }
                    state = TRIM_BANG_DASH;
                    break;
                case TRIM_BANG_DASH:
                    _buffer.append((char) c);
                    if (c != '-') {
                        return emitBuffer();
                    }
                    state = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    _buffer.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Comment closed: drop it and look for more to trim.
                        _buffer.setLength(0);
                        state = TRIM_START;
                    }
                    else {
                        _buffer.append((char) c);
                        // A further '-' still leaves "--" as the last two
                        // characters, so "--->" must also close the comment.
                        if (c != '-') {
                            state = TRIM_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Tells whether the character may appear between '&amp;' and ';' in an
     * entity reference as recognised by this reader.
     *
     * @param c the character read, or -1 for end of stream.
     * @return <code>true</code> for ASCII letters, ASCII digits and '#'.
     */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '#') || (c >= '0' && c <= '9');
    }

    /**
     * Reads a single character of the cleaned-up stream.
     * <p>
     * The first call strips leading whitespace and comments. Afterwards, every
     * '&amp;' starts an entity scan: a known named entity is replaced by its
     * numeric form, an unknown "&amp;...;" token is passed through unchanged, and
     * an '&amp;' that is not followed by a well-formed entity is emitted as
     * "&amp;amp;".
     *
     * @return the character read, or -1 at end of stream.
     * @throws IOException if the underlying reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) { // converts literal entities to coded entities
            switch (_state) {
                case STATE_STREAM: {
                    int c = in.read();
                    if (c != '&') {
                        return c; // ordinary character, or -1 at end of stream
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append('&');
                    _state = STATE_ENTITY;
                    break;
                }
                case STATE_ENTITY: {
                    int c = in.read();
                    if (c == ';') {
                        _buffer.append(';');
                        _state = STATE_REPLACE;
                    }
                    else
                    if (isEntityChar(c)) {
                        _buffer.append((char) c);
                    }
                    else {
                        // no ';' to match the '&' lets just make the '&'
                        // a legal xml character entity '&amp;'
                        _buffer.insert(1, "amp;");
                        if (c == '&') {
                            // The terminator is itself an '&': it must go
                            // through entity handling too, not be copied raw.
                            _pendingAmp = true;
                        }
                        else
                        if (c > -1) {
                            _buffer.append((char) c);
                        }
                        _state = STATE_BUFFER;
                    }
                    break;
                }
                case STATE_REPLACE: {
                    String codedEntity = (String) CODED_ENTITIES.get(_buffer.toString());
                    if (codedEntity != null) {
                        _buffer.setLength(0);
                        _buffer.append(codedEntity);
                    } // else we leave what was in the stream
                    _state = STATE_BUFFER;
                    break;
                }
                case STATE_BUFFER:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    if (_pendingAmp) {
                        // Restart the entity scan for the deferred '&'.
                        _pendingAmp = false;
                        _buffer.setLength(0);
                        _bufferPos = 0;
                        _buffer.append('&');
                        _state = STATE_ENTITY;
                    }
                    else {
                        _state = STATE_STREAM;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads cleaned-up characters into a portion of an array. Keeps reading
     * until <code>len</code> characters are stored or the stream ends.
     *
     * @param buffer destination array.
     * @param offset index at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if <code>len</code> is 0, or
     *         -1 if the end of the stream was reached before any character
     *         could be read.
     * @throws IndexOutOfBoundsException if <code>offset</code> or
     *         <code>len</code> is negative, or <code>len</code> exceeds the
     *         space available after <code>offset</code>.
     * @throws IOException if the underlying reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        // Validate before touching the stream so a bad call consumes nothing.
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException(
                "offset=" + offset + ", len=" + len + ", buffer.length=" + buffer.length);
        }
        if (len == 0) {
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the cleaned-up stream.
     *
     * @param n the number of characters to skip.
     * @return the number of characters actually skipped, which is smaller than
     *         <code>n</code> when the end of the stream is reached first.
     * @throws IllegalArgumentException if <code>n</code> is negative.
     * @throws IOException if the underlying reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // The read that reports end of stream must not be counted.
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a character can be returned without consulting the
     * underlying reader, or whether the underlying reader reports itself ready.
     *
     * @return <code>true</code> if buffered characters are pending or the
     *         wrapped reader is ready.
     * @throws IOException if the underlying reader fails.
     */
    public boolean ready() throws IOException {
        boolean buffered = (_state == STATE_BUFFER) && (_bufferPos < _buffer.length());
        return buffered || in.ready();
    }

    /**
     * @return always <code>false</code>; mark/reset are not supported.
     */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @param readAheadLimit ignored.
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing the underlying reader fails.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Maps a named entity, written with its '&amp;' and ';' delimiters, to the
     * equivalent numeric character reference. Populated once in the static
     * initializer below and only read afterwards.
     */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        // note: refer to Character entity references in HTML 4
        // at http://www.w3.org/TR/REC-html40/sgml/entities.html

        // Character entity set.
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"

        CODED_ENTITIES.put("&nbsp;",  "&#160;");
        CODED_ENTITIES.put("&iexcl;", "&#161;");
        CODED_ENTITIES.put("&cent;",  "&#162;");
        CODED_ENTITIES.put("&pound;", "&#163;");
        CODED_ENTITIES.put("&curren;","&#164;");
        CODED_ENTITIES.put("&yen;",   "&#165;");
        CODED_ENTITIES.put("&brvbar;","&#166;");
        CODED_ENTITIES.put("&sect;",  "&#167;");
        CODED_ENTITIES.put("&uml;",   "&#168;");
        CODED_ENTITIES.put("&copy;",  "&#169;");
        CODED_ENTITIES.put("&ordf;",  "&#170;");
        CODED_ENTITIES.put("&laquo;", "&#171;");
        CODED_ENTITIES.put("&not;",   "&#172;");
        CODED_ENTITIES.put("&shy;",   "&#173;");
        CODED_ENTITIES.put("&reg;",   "&#174;");
        CODED_ENTITIES.put("&macr;",  "&#175;");
        CODED_ENTITIES.put("&deg;",   "&#176;");
        CODED_ENTITIES.put("&plusmn;","&#177;");
        CODED_ENTITIES.put("&sup2;",  "&#178;");
        CODED_ENTITIES.put("&sup3;",  "&#179;");
        CODED_ENTITIES.put("&acute;", "&#180;");
        CODED_ENTITIES.put("&micro;", "&#181;");
        CODED_ENTITIES.put("&para;",  "&#182;");
        CODED_ENTITIES.put("&middot;","&#183;");
        CODED_ENTITIES.put("&cedil;", "&#184;");
        CODED_ENTITIES.put("&sup1;",  "&#185;");
        CODED_ENTITIES.put("&ordm;",  "&#186;");
        CODED_ENTITIES.put("&raquo;", "&#187;");
        CODED_ENTITIES.put("&frac14;","&#188;");
        CODED_ENTITIES.put("&frac12;","&#189;");
        CODED_ENTITIES.put("&frac34;","&#190;");
        CODED_ENTITIES.put("&iquest;","&#191;");
        CODED_ENTITIES.put("&Agrave;","&#192;");
        CODED_ENTITIES.put("&Aacute;","&#193;");
        CODED_ENTITIES.put("&Acirc;", "&#194;");
        CODED_ENTITIES.put("&Atilde;","&#195;");
        CODED_ENTITIES.put("&Auml;",  "&#196;");
        CODED_ENTITIES.put("&Aring;", "&#197;");
        CODED_ENTITIES.put("&AElig;", "&#198;");
        CODED_ENTITIES.put("&Ccedil;","&#199;");
        CODED_ENTITIES.put("&Egrave;","&#200;");
        CODED_ENTITIES.put("&Eacute;","&#201;");
        CODED_ENTITIES.put("&Ecirc;", "&#202;");
        CODED_ENTITIES.put("&Euml;",  "&#203;");
        CODED_ENTITIES.put("&Igrave;","&#204;");
        CODED_ENTITIES.put("&Iacute;","&#205;");
        CODED_ENTITIES.put("&Icirc;", "&#206;");
        CODED_ENTITIES.put("&Iuml;",  "&#207;");
        CODED_ENTITIES.put("&ETH;",   "&#208;");
        CODED_ENTITIES.put("&Ntilde;","&#209;");
        CODED_ENTITIES.put("&Ograve;","&#210;");
        CODED_ENTITIES.put("&Oacute;","&#211;");
        CODED_ENTITIES.put("&Ocirc;", "&#212;");
        CODED_ENTITIES.put("&Otilde;","&#213;");
        CODED_ENTITIES.put("&Ouml;",  "&#214;");
        CODED_ENTITIES.put("&times;", "&#215;");
        CODED_ENTITIES.put("&Oslash;","&#216;");
        CODED_ENTITIES.put("&Ugrave;","&#217;");
        CODED_ENTITIES.put("&Uacute;","&#218;");
        CODED_ENTITIES.put("&Ucirc;", "&#219;");
        CODED_ENTITIES.put("&Uuml;",  "&#220;");
        CODED_ENTITIES.put("&Yacute;","&#221;");
        CODED_ENTITIES.put("&THORN;", "&#222;");
        CODED_ENTITIES.put("&szlig;", "&#223;");
        CODED_ENTITIES.put("&agrave;","&#224;");
        CODED_ENTITIES.put("&aacute;","&#225;");
        CODED_ENTITIES.put("&acirc;", "&#226;");
        CODED_ENTITIES.put("&atilde;","&#227;");
        CODED_ENTITIES.put("&auml;",  "&#228;");
        CODED_ENTITIES.put("&aring;", "&#229;");
        CODED_ENTITIES.put("&aelig;", "&#230;");
        CODED_ENTITIES.put("&ccedil;","&#231;");
        CODED_ENTITIES.put("&egrave;","&#232;");
        CODED_ENTITIES.put("&eacute;","&#233;");
        CODED_ENTITIES.put("&ecirc;", "&#234;");
        CODED_ENTITIES.put("&euml;",  "&#235;");
        CODED_ENTITIES.put("&igrave;","&#236;");
        CODED_ENTITIES.put("&iacute;","&#237;");
        CODED_ENTITIES.put("&icirc;", "&#238;");
        CODED_ENTITIES.put("&iuml;",  "&#239;");
        CODED_ENTITIES.put("&eth;",   "&#240;");
        CODED_ENTITIES.put("&ntilde;","&#241;");
        CODED_ENTITIES.put("&ograve;","&#242;");
        CODED_ENTITIES.put("&oacute;","&#243;");
        CODED_ENTITIES.put("&ocirc;", "&#244;");
        CODED_ENTITIES.put("&otilde;","&#245;");
        CODED_ENTITIES.put("&ouml;",  "&#246;");
        CODED_ENTITIES.put("&divide;","&#247;");
        CODED_ENTITIES.put("&oslash;","&#248;");
        CODED_ENTITIES.put("&ugrave;","&#249;");
        CODED_ENTITIES.put("&uacute;","&#250;");
        CODED_ENTITIES.put("&ucirc;", "&#251;");
        CODED_ENTITIES.put("&uuml;",  "&#252;");
        CODED_ENTITIES.put("&yacute;","&#253;");
        CODED_ENTITIES.put("&thorn;", "&#254;");
        CODED_ENTITIES.put("&yuml;",  "&#255;");

        // Mathematical, Greek and Symbolic characters for HTML.
        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"

        CODED_ENTITIES.put("&fnof;",     "&#402;");
        CODED_ENTITIES.put("&Alpha;",    "&#913;");
        CODED_ENTITIES.put("&Beta;",     "&#914;");
        CODED_ENTITIES.put("&Gamma;",    "&#915;");
        CODED_ENTITIES.put("&Delta;",    "&#916;");
        CODED_ENTITIES.put("&Epsilon;",  "&#917;");
        CODED_ENTITIES.put("&Zeta;",     "&#918;");
        CODED_ENTITIES.put("&Eta;",      "&#919;");
        CODED_ENTITIES.put("&Theta;",    "&#920;");
        CODED_ENTITIES.put("&Iota;",     "&#921;");
        CODED_ENTITIES.put("&Kappa;",    "&#922;");
        CODED_ENTITIES.put("&Lambda;",   "&#923;");
        CODED_ENTITIES.put("&Mu;",       "&#924;");
        CODED_ENTITIES.put("&Nu;",       "&#925;");
        CODED_ENTITIES.put("&Xi;",       "&#926;");
        CODED_ENTITIES.put("&Omicron;",  "&#927;");
        CODED_ENTITIES.put("&Pi;",       "&#928;");
        CODED_ENTITIES.put("&Rho;",      "&#929;");
        CODED_ENTITIES.put("&Sigma;",    "&#931;");
        CODED_ENTITIES.put("&Tau;",      "&#932;");
        CODED_ENTITIES.put("&Upsilon;",  "&#933;");
        CODED_ENTITIES.put("&Phi;",      "&#934;");
        CODED_ENTITIES.put("&Chi;",      "&#935;");
        CODED_ENTITIES.put("&Psi;",      "&#936;");
        CODED_ENTITIES.put("&Omega;",    "&#937;");
        CODED_ENTITIES.put("&alpha;",    "&#945;");
        CODED_ENTITIES.put("&beta;",     "&#946;");
        CODED_ENTITIES.put("&gamma;",    "&#947;");
        CODED_ENTITIES.put("&delta;",    "&#948;");
        CODED_ENTITIES.put("&epsilon;",  "&#949;");
        CODED_ENTITIES.put("&zeta;",     "&#950;");
        CODED_ENTITIES.put("&eta;",      "&#951;");
        CODED_ENTITIES.put("&theta;",    "&#952;");
        CODED_ENTITIES.put("&iota;",     "&#953;");
        CODED_ENTITIES.put("&kappa;",    "&#954;");
        CODED_ENTITIES.put("&lambda;",   "&#955;");
        CODED_ENTITIES.put("&mu;",       "&#956;");
        CODED_ENTITIES.put("&nu;",       "&#957;");
        CODED_ENTITIES.put("&xi;",       "&#958;");
        CODED_ENTITIES.put("&omicron;",  "&#959;");
        CODED_ENTITIES.put("&pi;",       "&#960;");
        CODED_ENTITIES.put("&rho;",      "&#961;");
        CODED_ENTITIES.put("&sigmaf;",   "&#962;");
        CODED_ENTITIES.put("&sigma;",    "&#963;");
        CODED_ENTITIES.put("&tau;",      "&#964;");
        CODED_ENTITIES.put("&upsilon;",  "&#965;");
        CODED_ENTITIES.put("&phi;",      "&#966;");
        CODED_ENTITIES.put("&chi;",      "&#967;");
        CODED_ENTITIES.put("&psi;",      "&#968;");
        CODED_ENTITIES.put("&omega;",    "&#969;");
        CODED_ENTITIES.put("&thetasym;", "&#977;");
        CODED_ENTITIES.put("&upsih;",    "&#978;");
        CODED_ENTITIES.put("&piv;",      "&#982;");
        CODED_ENTITIES.put("&bull;",     "&#8226;");
        CODED_ENTITIES.put("&hellip;",   "&#8230;");
        CODED_ENTITIES.put("&prime;",    "&#8242;");
        CODED_ENTITIES.put("&Prime;",    "&#8243;");
        CODED_ENTITIES.put("&oline;",    "&#8254;");
        CODED_ENTITIES.put("&frasl;",    "&#8260;");
        CODED_ENTITIES.put("&weierp;",   "&#8472;");
        CODED_ENTITIES.put("&image;",    "&#8465;");
        CODED_ENTITIES.put("&real;",     "&#8476;");
        CODED_ENTITIES.put("&trade;",    "&#8482;");
        CODED_ENTITIES.put("&alefsym;",  "&#8501;");
        CODED_ENTITIES.put("&larr;",     "&#8592;");
        CODED_ENTITIES.put("&uarr;",     "&#8593;");
        CODED_ENTITIES.put("&rarr;",     "&#8594;");
        CODED_ENTITIES.put("&darr;",     "&#8595;");
        CODED_ENTITIES.put("&harr;",     "&#8596;");
        CODED_ENTITIES.put("&crarr;",    "&#8629;");
        CODED_ENTITIES.put("&lArr;",     "&#8656;");
        CODED_ENTITIES.put("&uArr;",     "&#8657;");
        CODED_ENTITIES.put("&rArr;",     "&#8658;");
        CODED_ENTITIES.put("&dArr;",     "&#8659;");
        CODED_ENTITIES.put("&hArr;",     "&#8660;");
        CODED_ENTITIES.put("&forall;",   "&#8704;");
        CODED_ENTITIES.put("&part;",     "&#8706;");
        CODED_ENTITIES.put("&exist;",    "&#8707;");
        CODED_ENTITIES.put("&empty;",    "&#8709;");
        CODED_ENTITIES.put("&nabla;",    "&#8711;");
        CODED_ENTITIES.put("&isin;",     "&#8712;");
        CODED_ENTITIES.put("&notin;",    "&#8713;");
        CODED_ENTITIES.put("&ni;",       "&#8715;");
        CODED_ENTITIES.put("&prod;",     "&#8719;");
        CODED_ENTITIES.put("&sum;",      "&#8721;");
        CODED_ENTITIES.put("&minus;",    "&#8722;");
        CODED_ENTITIES.put("&lowast;",   "&#8727;");
        CODED_ENTITIES.put("&radic;",    "&#8730;");
        CODED_ENTITIES.put("&prop;",     "&#8733;");
        CODED_ENTITIES.put("&infin;",    "&#8734;");
        CODED_ENTITIES.put("&ang;",      "&#8736;");
        CODED_ENTITIES.put("&and;",      "&#8743;");
        CODED_ENTITIES.put("&or;",       "&#8744;");
        CODED_ENTITIES.put("&cap;",      "&#8745;");
        CODED_ENTITIES.put("&cup;",      "&#8746;");
        CODED_ENTITIES.put("&int;",      "&#8747;");
        CODED_ENTITIES.put("&there4;",   "&#8756;");
        CODED_ENTITIES.put("&sim;",      "&#8764;");
        CODED_ENTITIES.put("&cong;",     "&#8773;");
        CODED_ENTITIES.put("&asymp;",    "&#8776;");
        CODED_ENTITIES.put("&ne;",       "&#8800;");
        CODED_ENTITIES.put("&equiv;",    "&#8801;");
        CODED_ENTITIES.put("&le;",       "&#8804;");
        CODED_ENTITIES.put("&ge;",       "&#8805;");
        CODED_ENTITIES.put("&sub;",      "&#8834;");
        CODED_ENTITIES.put("&sup;",      "&#8835;");
        CODED_ENTITIES.put("&nsub;",     "&#8836;");
        CODED_ENTITIES.put("&sube;",     "&#8838;");
        CODED_ENTITIES.put("&supe;",     "&#8839;");
        CODED_ENTITIES.put("&oplus;",    "&#8853;");
        CODED_ENTITIES.put("&otimes;",   "&#8855;");
        CODED_ENTITIES.put("&perp;",     "&#8869;");
        CODED_ENTITIES.put("&sdot;",     "&#8901;");
        CODED_ENTITIES.put("&lceil;",    "&#8968;");
        CODED_ENTITIES.put("&rceil;",    "&#8969;");
        CODED_ENTITIES.put("&lfloor;",   "&#8970;");
        CODED_ENTITIES.put("&rfloor;",   "&#8971;");
        CODED_ENTITIES.put("&lang;",     "&#9001;");
        CODED_ENTITIES.put("&rang;",     "&#9002;");
        CODED_ENTITIES.put("&loz;",      "&#9674;");
        CODED_ENTITIES.put("&spades;",   "&#9824;");
        CODED_ENTITIES.put("&clubs;",    "&#9827;");
        CODED_ENTITIES.put("&hearts;",   "&#9829;");
        CODED_ENTITIES.put("&diams;",    "&#9830;");

        // Special characters for HTML.
        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"

        CODED_ENTITIES.put("&quot;",      "&#34;");
        CODED_ENTITIES.put("&amp;",       "&#38;");
        CODED_ENTITIES.put("&lt;",        "&#60;");
        CODED_ENTITIES.put("&gt;",        "&#62;");
        CODED_ENTITIES.put("&OElig;",     "&#338;");
        CODED_ENTITIES.put("&oelig;",     "&#339;");
        CODED_ENTITIES.put("&Scaron;",    "&#352;");
        CODED_ENTITIES.put("&scaron;",    "&#353;");
        CODED_ENTITIES.put("&Yuml;",      "&#376;");
        CODED_ENTITIES.put("&circ;",      "&#710;");
        CODED_ENTITIES.put("&tilde;",     "&#732;");
        CODED_ENTITIES.put("&ensp;",      "&#8194;");
        CODED_ENTITIES.put("&emsp;",      "&#8195;");
        CODED_ENTITIES.put("&thinsp;",    "&#8201;");
        CODED_ENTITIES.put("&zwnj;",      "&#8204;");
        CODED_ENTITIES.put("&zwj;",       "&#8205;");
        CODED_ENTITIES.put("&lrm;",       "&#8206;");
        CODED_ENTITIES.put("&rlm;",       "&#8207;");
        CODED_ENTITIES.put("&ndash;",     "&#8211;");
        CODED_ENTITIES.put("&mdash;",     "&#8212;");
        CODED_ENTITIES.put("&lsquo;",     "&#8216;");
        CODED_ENTITIES.put("&rsquo;",     "&#8217;");
        CODED_ENTITIES.put("&sbquo;",     "&#8218;");
        CODED_ENTITIES.put("&ldquo;",     "&#8220;");
        CODED_ENTITIES.put("&rdquo;",     "&#8221;");
        CODED_ENTITIES.put("&bdquo;",     "&#8222;");
        CODED_ENTITIES.put("&dagger;",    "&#8224;");
        CODED_ENTITIES.put("&Dagger;",    "&#8225;");
        CODED_ENTITIES.put("&permil;",    "&#8240;");
        CODED_ENTITIES.put("&lsaquo;",    "&#8249;");
        CODED_ENTITIES.put("&rsaquo;",    "&#8250;");
        CODED_ENTITIES.put("&euro;",      "&#8364;");
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /**
     * Matches an "&amp;name;" token. Digits are part of the class because
     * several table keys contain them (frac14, sup2, there4, ...); the
     * character set mirrors the one accepted while scanning in read().
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile( "&[A-Za-z0-9#]+;" );


    /**
     * Replaces every named HTML entity of the given string that is present in
     * the entity table by its numeric character reference. Tokens that are not
     * in the table (including numeric references) are copied unchanged.
     *
     * @param s the text to process; may be <code>null</code>.
     * @return the processed text, or <code>s</code> itself when it is
     *         <code>null</code> or contains no '&amp;'.
     */
    public String processHtmlEntities(String s) {
        if (s == null || s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        // One matcher walks the whole string; 'pos' marks the end of the
        // previously copied region.
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            sb.append(s.substring(pos, m.start()));
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}