View Javadoc

1   /*
2    * Copyright 2005 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io.impl;
18  
19  import java.io.IOException;
20  import java.io.Reader;
21  import java.io.InputStreamReader;
22  import java.io.BufferedReader;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.regex.Pattern;
26  import java.util.regex.Matcher;
27  import java.net.URL;
28  
/**
 * Character stream filter that cleans up a feed before it reaches an XML parser.
 * <p>
 * On the first read it drops everything that precedes the first real content:
 * leading XML whitespace (space, tab, CR, LF) and any leading
 * <code>&lt;!-- ... --&gt;</code> comments. After that it passes characters through,
 * replacing the named HTML Latin-1 entities listed in <code>CODED_ENTITIES</code>
 * (for example <code>&amp;nbsp;</code>) with their numeric character references
 * (for example <code>&amp;#160;</code>). Unknown entities are passed through untouched.
 * <p>
 * Known limitation: an <code>&amp;</code> that appears while an entity is already being
 * collected (as in <code>AT&amp;T&amp;nbsp;</code>) terminates the first candidate and is
 * not itself re-examined, so the entity that follows it is not replaced.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    // States of the entity-replacing machine driven by read().
    private static final int STATE_READING = 0;   // passing characters straight through
    private static final int STATE_ENTITY = 1;    // collecting "&name" into the buffer
    private static final int STATE_REPLACING = 2; // a full "&name;" is buffered, look it up
    private static final int STATE_FLUSHING = 3;  // handing out buffered characters

    // States of the leading-junk trimmer driven by trimStream().
    private static final int TRIM_SKIP_WHITESPACE = 0;   // before any content
    private static final int TRIM_LT = 1;                // seen "<"
    private static final int TRIM_LT_BANG = 2;           // seen "<!"
    private static final int TRIM_LT_BANG_DASH = 3;      // seen "<!-"
    private static final int TRIM_IN_COMMENT = 4;        // inside "<!-- ..."
    private static final int TRIM_COMMENT_DASH = 5;      // inside comment, last char was "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6; // inside comment, last chars were "--"

    /**
     * Small manual test driver: fetches the URL given as first argument, runs it through
     * an <code>XmlFixerReader</code> and prints the result line by line.
     * The bytes are decoded with the platform default charset.
     *
     * @param args <code>args[0]</code> is the URL to read.
     * @throws Exception if the URL is malformed or reading fails.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: XmlFixerReader <url>");
            return;
        }
        Reader fixed = new XmlFixerReader(new InputStreamReader(new URL(args[0]).openStream()));
        BufferedReader lines = new BufferedReader(fixed);
        try {
            String line = lines.readLine();
            while (line != null) {
                System.out.println(line);
                line = lines.readLine();
            }
        }
        finally {
            // closing the outer reader closes the whole chain down to the URL stream
            lines.close();
        }
    }

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in reader to filter; also used as the lock object of this reader.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
    }

    /** Becomes true once the leading whitespace/comments have been dealt with. */
    private boolean trimmed;
    /** Characters read ahead from the stream that still have to be handed to the caller. */
    private final StringBuffer _buffer = new StringBuffer();
    /** Index of the next character of <code>_buffer</code> to hand out. */
    private int _bufferPos;
    /** Current state of the entity-replacing machine, one of the STATE_ constants. */
    private int _state = STATE_READING;

    /** Tells whether the character is XML whitespace (space, tab, CR or LF). */
    private static boolean isXmlWhitespace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /** Tells whether the character may appear between the '&amp;' and the ';' of an entity. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '#' || (c >= '0' && c <= '9');
    }

    /**
     * Consumes leading whitespace and leading comments from the wrapped reader.
     * <p>
     * Everything read after the last discarded comment is kept in <code>_buffer</code>, so
     * that it can be replayed when it turns out not to be (or not to complete) a comment;
     * in that case the main state machine is switched to flushing mode.
     *
     * @return <code>false</code> if the stream ended with nothing but whitespace and
     *         comments, <code>true</code> if there is content to read.
     * @throws IOException if the wrapped reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_SKIP_WHITESPACE;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_SKIP_WHITESPACE) {
                    return false;
                }
                // stream ended in the middle of a tag/comment: replay what was read
                _state = STATE_FLUSHING;
                return true;
            }
            switch (state) {
                case TRIM_SKIP_WHITESPACE:
                    if (isXmlWhitespace(c)) {
                        break;
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    if (c != '<') {
                        _state = STATE_FLUSHING;
                        return true;
                    }
                    state = TRIM_LT;
                    break;
                case TRIM_LT:
                    _buffer.append((char) c);
                    if (c != '!') {
                        _state = STATE_FLUSHING;
                        return true;
                    }
                    state = TRIM_LT_BANG;
                    break;
                case TRIM_LT_BANG:
                    _buffer.append((char) c);
                    if (c != '-') {
                        // "<!" followed by something else, e.g. a DOCTYPE: real content
                        _state = STATE_FLUSHING;
                        return true;
                    }
                    state = TRIM_LT_BANG_DASH;
                    break;
                case TRIM_LT_BANG_DASH:
                    _buffer.append((char) c);
                    if (c != '-') {
                        _state = STATE_FLUSHING;
                        return true;
                    }
                    state = TRIM_IN_COMMENT;
                    break;
                case TRIM_IN_COMMENT:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    _buffer.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_IN_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // comment complete: forget it and look for more leading junk
                        _buffer.setLength(0);
                        state = TRIM_SKIP_WHITESPACE;
                    }
                    else {
                        _buffer.append((char) c);
                        // one more '-' still leaves "--" as the last two characters,
                        // so "--->" closes the comment too
                        if (c != '-') {
                            state = TRIM_IN_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character, trimming the head of the stream on the first call and
     * converting literal entities to coded entities afterwards.
     *
     * @return the character read, or <code>-1</code> at the end of the stream.
     * @throws IOException if the wrapped reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) { // trims XML stream
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) {
            switch (_state) {
                case STATE_READING: {
                    int c = in.read();
                    if (c != '&') {
                        return c; // plain character or end of stream
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    _state = STATE_ENTITY;
                    break;
                }
                case STATE_ENTITY: {
                    int c = in.read();
                    if (c == -1) {
                        _state = STATE_FLUSHING;
                    }
                    else {
                        _buffer.append((char) c);
                        if (c == ';') {
                            _state = STATE_REPLACING;
                        }
                        else if (!isEntityChar(c)) {
                            // not an entity after all, hand out what was collected
                            _state = STATE_FLUSHING;
                        }
                    }
                    break;
                }
                case STATE_REPLACING: {
                    String codedEntity = (String) CODED_ENTITIES.get(_buffer.toString());
                    if (codedEntity != null) {
                        _buffer.setLength(0);
                        _buffer.append(codedEntity);
                    } // else we leave what was in the stream
                    _state = STATE_FLUSHING;
                    break;
                }
                case STATE_FLUSHING:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    _state = STATE_READING;
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads up to <code>len</code> characters into <code>buffer</code> starting at
     * <code>offset</code>, stopping early only at the end of the stream.
     *
     * @param buffer destination array.
     * @param offset index of the first character to store.
     * @param len maximum number of characters to read.
     * @return number of characters stored, <code>0</code> if <code>len</code> is zero, or
     *         <code>-1</code> if the stream was already at its end.
     * @throws IndexOutOfBoundsException if <code>offset</code>/<code>len</code> do not
     *         describe a range inside <code>buffer</code>.
     * @throws IOException if the wrapped reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            // must not consume (or store) anything when nothing was asked for
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + (charsRead++)] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + (charsRead++)] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the filtered stream.
     *
     * @param n number of characters to skip.
     * @return number of characters actually skipped, which is less than <code>n</code>
     *         when the end of the stream is reached first.
     * @throws IllegalArgumentException if <code>n</code> is negative.
     * @throws IOException if the wrapped reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // only characters really obtained are counted, not the read that hits the end
        while (skipped < n && read() != -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a read can proceed: either the state machine is not in plain
     * reading mode, or the wrapped reader reports it is ready.
     *
     * @return <code>true</code> under the conditions above.
     * @throws IOException if the wrapped reader fails.
     */
    public boolean ready() throws IOException {
        return (_state != STATE_READING) || in.ready();
    }

    /**
     * Mark is not supported.
     *
     * @return always <code>false</code>.
     */
    public boolean markSupported() {
        return false;
    }

    /**
     * Always fails, mark is not supported.
     *
     * @param readAheadLimit ignored.
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Always fails, mark is not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if the wrapped reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /** Maps named HTML Latin-1 entities (with '&amp;' and ';') to numeric references. */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        CODED_ENTITIES.put("&nbsp;",  "&#160;");
        CODED_ENTITIES.put("&iexcl;", "&#161;");
        CODED_ENTITIES.put("&cent;",  "&#162;");
        CODED_ENTITIES.put("&pound;", "&#163;");
        CODED_ENTITIES.put("&curren;","&#164;");
        CODED_ENTITIES.put("&yen;",   "&#165;");
        CODED_ENTITIES.put("&brvbar;","&#166;");
        CODED_ENTITIES.put("&sect;",  "&#167;");
        CODED_ENTITIES.put("&uml;",   "&#168;");
        CODED_ENTITIES.put("&copy;",  "&#169;");
        CODED_ENTITIES.put("&ordf;",  "&#170;");
        CODED_ENTITIES.put("&laquo;", "&#171;");
        CODED_ENTITIES.put("&not;",   "&#172;");
        CODED_ENTITIES.put("&shy;",   "&#173;");
        CODED_ENTITIES.put("&reg;",   "&#174;");
        CODED_ENTITIES.put("&macr;",  "&#175;");
        CODED_ENTITIES.put("&deg;",   "&#176;");
        CODED_ENTITIES.put("&plusmn;","&#177;");
        CODED_ENTITIES.put("&sup2;",  "&#178;");
        CODED_ENTITIES.put("&sup3;",  "&#179;");
        CODED_ENTITIES.put("&acute;", "&#180;");
        CODED_ENTITIES.put("&micro;", "&#181;");
        CODED_ENTITIES.put("&para;",  "&#182;");
        CODED_ENTITIES.put("&middot;","&#183;");
        CODED_ENTITIES.put("&cedil;", "&#184;");
        CODED_ENTITIES.put("&sup1;",  "&#185;");
        CODED_ENTITIES.put("&ordm;",  "&#186;");
        CODED_ENTITIES.put("&raquo;", "&#187;");
        CODED_ENTITIES.put("&frac14;","&#188;");
        CODED_ENTITIES.put("&frac12;","&#189;");
        CODED_ENTITIES.put("&frac34;","&#190;");
        CODED_ENTITIES.put("&iquest;","&#191;");
        CODED_ENTITIES.put("&Agrave;","&#192;");
        CODED_ENTITIES.put("&Aacute;","&#193;");
        CODED_ENTITIES.put("&Acirc;", "&#194;");
        CODED_ENTITIES.put("&Atilde;","&#195;");
        CODED_ENTITIES.put("&Auml;",  "&#196;");
        CODED_ENTITIES.put("&Aring;", "&#197;");
        CODED_ENTITIES.put("&AElig;", "&#198;");
        CODED_ENTITIES.put("&Ccedil;","&#199;");
        CODED_ENTITIES.put("&Egrave;","&#200;");
        CODED_ENTITIES.put("&Eacute;","&#201;");
        CODED_ENTITIES.put("&Ecirc;", "&#202;");
        CODED_ENTITIES.put("&Euml;",  "&#203;");
        CODED_ENTITIES.put("&Igrave;","&#204;");
        CODED_ENTITIES.put("&Iacute;","&#205;");
        CODED_ENTITIES.put("&Icirc;", "&#206;");
        CODED_ENTITIES.put("&Iuml;",  "&#207;");
        CODED_ENTITIES.put("&ETH;",   "&#208;");
        CODED_ENTITIES.put("&Ntilde;","&#209;");
        CODED_ENTITIES.put("&Ograve;","&#210;");
        CODED_ENTITIES.put("&Oacute;","&#211;");
        CODED_ENTITIES.put("&Ocirc;", "&#212;");
        CODED_ENTITIES.put("&Otilde;","&#213;");
        CODED_ENTITIES.put("&Ouml;",  "&#214;");
        CODED_ENTITIES.put("&times;", "&#215;");
        CODED_ENTITIES.put("&Oslash;","&#216;");
        CODED_ENTITIES.put("&Ugrave;","&#217;");
        CODED_ENTITIES.put("&Uacute;","&#218;");
        CODED_ENTITIES.put("&Ucirc;", "&#219;");
        CODED_ENTITIES.put("&Uuml;",  "&#220;");
        CODED_ENTITIES.put("&Yacute;","&#221;");
        CODED_ENTITIES.put("&THORN;", "&#222;");
        CODED_ENTITIES.put("&szlig;", "&#223;");
        CODED_ENTITIES.put("&agrave;","&#224;");
        CODED_ENTITIES.put("&aacute;","&#225;");
        CODED_ENTITIES.put("&acirc;", "&#226;");
        CODED_ENTITIES.put("&atilde;","&#227;");
        CODED_ENTITIES.put("&auml;",  "&#228;");
        CODED_ENTITIES.put("&aring;", "&#229;");
        CODED_ENTITIES.put("&aelig;", "&#230;");
        CODED_ENTITIES.put("&ccedil;","&#231;");
        CODED_ENTITIES.put("&egrave;","&#232;");
        CODED_ENTITIES.put("&eacute;","&#233;");
        CODED_ENTITIES.put("&ecirc;", "&#234;");
        CODED_ENTITIES.put("&euml;",  "&#235;");
        CODED_ENTITIES.put("&igrave;","&#236;");
        CODED_ENTITIES.put("&iacute;","&#237;");
        CODED_ENTITIES.put("&icirc;", "&#238;");
        CODED_ENTITIES.put("&iuml;",  "&#239;");
        CODED_ENTITIES.put("&eth;",   "&#240;");
        CODED_ENTITIES.put("&ntilde;","&#241;");
        CODED_ENTITIES.put("&ograve;","&#242;");
        CODED_ENTITIES.put("&oacute;","&#243;");
        CODED_ENTITIES.put("&ocirc;", "&#244;");
        CODED_ENTITIES.put("&otilde;","&#245;");
        CODED_ENTITIES.put("&ouml;",  "&#246;");
        CODED_ENTITIES.put("&divide;","&#247;");
        CODED_ENTITIES.put("&oslash;","&#248;");
        CODED_ENTITIES.put("&ugrave;","&#249;");
        CODED_ENTITIES.put("&uacute;","&#250;");
        CODED_ENTITIES.put("&ucirc;", "&#251;");
        CODED_ENTITIES.put("&uuml;",  "&#252;");
        CODED_ENTITIES.put("&yacute;","&#253;");
        CODED_ENTITIES.put("&thorn;", "&#254;");
        CODED_ENTITIES.put("&yuml;",  "&#255;");
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    // Digits are part of the class because several mapped names contain them
    // (sup1, sup2, sup3, frac14, frac12, frac34).
    private static final Pattern ENTITIES_PATTERN = Pattern.compile( "&[A-Za-z0-9#]+;" );


    /**
     * Replaces, in a string, every named entity known to <code>CODED_ENTITIES</code> with
     * its numeric character reference. Anything else, including unknown entities and
     * numeric references, is copied unchanged.
     *
     * @param s text to process, must not be <code>null</code>.
     * @return <code>s</code> itself if it contains no '&amp;', otherwise the converted text.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&')==-1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            // text between the previous entity and this one
            sb.append(s.substring(pos, m.start()));
            String entity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(entity);
            sb.append(codedEntity != null ? codedEntity : entity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}