1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io.impl;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.io.InputStreamReader;
22 import java.io.BufferedReader;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.regex.Pattern;
26 import java.util.regex.Matcher;
27 import java.net.URL;
28
/**
 * Character stream filter that makes a sloppy XML document more palatable to
 * an XML parser. It does two things while passing characters through from the
 * wrapped reader:
 * <ul>
 * <li>Before the first character is delivered, leading XML white space and
 *     leading comments ({@code <!-- ... -->}) are discarded, so the document
 *     starts with its first real markup (typically the XML declaration).</li>
 * <li>Named HTML Latin-1 entity references (for example {@code &nbsp;}), which
 *     are not declared in plain XML, are replaced by the equivalent numeric
 *     character reference (for example {@code &#160;}). Any other reference
 *     is passed through unchanged.</li>
 * </ul>
 * Instances keep parsing state and are not safe for concurrent use.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    /* States of the entity-fixing state machine driven by read(). */

    /** Characters are copied straight from the wrapped reader. */
    private static final int STATE_PASS_THROUGH = 0;
    /** An ampersand was seen; characters are buffered until the reference ends. */
    private static final int STATE_COLLECT_ENTITY = 1;
    /** A complete reference is buffered and must be looked up. */
    private static final int STATE_RESOLVE_ENTITY = 2;
    /** Buffered characters are being handed out one by one. */
    private static final int STATE_DRAIN_BUFFER = 3;

    /* States of the prolog-trimming state machine in trimStream(). */

    private static final int TRIM_SKIP_WHITESPACE = 0;
    private static final int TRIM_LT = 1;                 // seen "<"
    private static final int TRIM_LT_BANG = 2;            // seen "<!"
    private static final int TRIM_LT_BANG_DASH = 3;       // seen "<!-"
    private static final int TRIM_COMMENT = 4;            // inside "<!-- ..."
    private static final int TRIM_COMMENT_DASH = 5;       // inside comment, seen "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6;  // inside comment, seen "--"

    /** Code point of the first entry of {@link #LATIN1_ENTITY_NAMES}. */
    private static final int FIRST_LATIN1_CODE = 160;

    /**
     * HTML Latin-1 entity names in code point order, starting at
     * {@link #FIRST_LATIN1_CODE} (160) and ending at 255.
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
        "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
        "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
        "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
        "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
        "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
        "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
        "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /**
     * Maps a literal named reference (ampersand, name, semicolon) to the
     * numeric character reference for the same code point. Populated once in
     * the static initializer and only read afterwards.
     */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        // The table is generated instead of spelled out so that key and value
        // can never drift apart: the lookup key must be the exact text that
        // read() buffers, i.e. '&' + name + ';'.
        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            CODED_ENTITIES.put("&" + LATIN1_ENTITY_NAMES[i] + ";",
                               "&#" + (FIRST_LATIN1_CODE + i) + ";");
        }
    }

    /**
     * Matches one entity reference candidate. The accepted characters are the
     * same ones read() accepts while collecting a reference (letters, digits
     * and '#'), so names containing digits are matched as well.
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z0-9#]+;");

    /** The wrapped reader all characters are obtained from. */
    protected Reader in;

    /** True once the leading white space and comments have been consumed. */
    private boolean trimmed;
    /** Characters read ahead that still have to be delivered to the caller. */
    private StringBuffer _buffer;
    /** Index in {@link #_buffer} of the next character to deliver. */
    private int _bufferPos;
    /** Index in {@link #_buffer} where the current reference candidate starts. */
    private int _entityStart;
    /** Current state of the entity-fixing state machine. */
    private int _state = STATE_PASS_THROUGH;

    /**
     * Command line helper: fetches the URL given as first argument, runs it
     * through an XmlFixerReader and prints the result to standard output.
     * The stream is decoded with the platform default charset.
     *
     * @param args a single element, the URL to fetch.
     * @throws Exception if the URL is malformed or cannot be read.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: java " + XmlFixerReader.class.getName() + " <url>");
            return;
        }
        Reader r = new InputStreamReader(new URL(args[0]).openStream());
        BufferedReader br = new BufferedReader(new XmlFixerReader(r));
        try {
            String l = br.readLine();
            while (l != null) {
                System.out.println(l);
                l = br.readLine();
            }
        }
        finally {
            // Closing the outermost reader closes the whole chain, including
            // the network stream.
            br.close();
        }
    }

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in reader providing the raw document; it is also used as the
     *        synchronization lock object passed to {@link Reader}.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        _buffer = new StringBuffer();
        _state = STATE_PASS_THROUGH;
    }

    /**
     * Tells whether the character is XML white space (space, tab, CR or LF).
     */
    private static boolean isXmlWhitespace(int c) {
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

    /**
     * Tells whether the character may appear between the '&' and the ';' of
     * an entity reference candidate.
     */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9') || c == '#';
    }

    /**
     * Consumes leading white space and comments from the wrapped reader.
     * <p>
     * Characters that might be the start of a comment are buffered; when they
     * turn out not to be one (or the stream ends in the middle of it) the
     * buffer is left in place and the main state machine is switched to
     * draining it, so nothing that is not a complete leading comment is lost.
     *
     * @return false if the stream ended before any content was found,
     *         true otherwise.
     * @throws IOException if the wrapped reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_SKIP_WHITESPACE;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_SKIP_WHITESPACE) {
                    return false;
                }
                // Stream ended inside a (possible) comment: deliver what was
                // buffered as is.
                _state = STATE_DRAIN_BUFFER;
                return true;
            }
            if (state == TRIM_SKIP_WHITESPACE) {
                if (isXmlWhitespace(c)) {
                    continue;
                }
                _buffer.setLength(0);
                _bufferPos = 0;
                _buffer.append((char) c);
                if (c == '<') {
                    state = TRIM_LT;
                    continue;
                }
                _state = STATE_DRAIN_BUFFER;
                return true;
            }
            _buffer.append((char) c);
            switch (state) {
                case TRIM_LT:
                    if (c != '!') {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    state = TRIM_LT_BANG;
                    break;
                case TRIM_LT_BANG:
                    if (c != '-') {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    state = TRIM_LT_BANG_DASH;
                    break;
                case TRIM_LT_BANG_DASH:
                    if (c != '-') {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    state = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Complete comment: drop it and look for more
                        // white space / comments.
                        _buffer.setLength(0);
                        state = TRIM_SKIP_WHITESPACE;
                    }
                    else
                    if (c != '-') {
                        state = TRIM_COMMENT;
                    }
                    // A further '-' keeps the "seen --" state so that a
                    // comment closed with three or more dashes still ends.
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character of the fixed document.
     *
     * @return the character read, or -1 at the end of the stream.
     * @throws IOException if the wrapped reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) {
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) {
            switch (_state) {
                case STATE_PASS_THROUGH: {
                    int c = in.read();
                    if (c != '&') {
                        return c; // regular character or end of stream
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _entityStart = 0;
                    _buffer.append((char) c);
                    _state = STATE_COLLECT_ENTITY;
                    break;
                }
                case STATE_COLLECT_ENTITY: {
                    int c = in.read();
                    if (c == -1) {
                        // Incomplete reference at end of stream: emit it raw.
                        _state = STATE_DRAIN_BUFFER;
                        break;
                    }
                    _buffer.append((char) c);
                    if (c == ';') {
                        _state = STATE_RESOLVE_ENTITY;
                    }
                    else
                    if (c == '&') {
                        // A new ampersand ends the previous candidate and
                        // starts another one; what precedes it is kept in the
                        // buffer and delivered untouched.
                        _entityStart = _buffer.length() - 1;
                    }
                    else
                    if (!isEntityChar(c)) {
                        _state = STATE_DRAIN_BUFFER;
                    }
                    break;
                }
                case STATE_RESOLVE_ENTITY: {
                    String literalEntity = _buffer.substring(_entityStart);
                    String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
                    if (codedEntity != null) {
                        _buffer.setLength(_entityStart);
                        _buffer.append(codedEntity);
                    }
                    _state = STATE_DRAIN_BUFFER;
                    break;
                }
                case STATE_DRAIN_BUFFER:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    _state = STATE_PASS_THROUGH;
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads characters into a portion of an array. Reads until the requested
     * number of characters has been stored or the stream ends.
     *
     * @param buffer destination array.
     * @param offset index at which to start storing characters.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if len is 0, or -1 if the
     *         end of the stream had already been reached.
     * @throws IndexOutOfBoundsException if offset or len is negative or the
     *         range does not fit in the array.
     * @throws IOException if the wrapped reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            // Nothing requested: must not consume or store anything.
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        do {
            buffer[offset + charsRead] = (char) c;
            charsRead++;
        } while (charsRead < len && (c = read()) != -1);
        return charsRead;
    }

    /**
     * Skips characters of the fixed document.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is smaller
     *         than n when the end of the stream is reached first.
     * @throws IllegalArgumentException if n is negative.
     * @throws IOException if the wrapped reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // Only characters that were really consumed are counted; the read
        // that hits the end of the stream is not.
        while (skipped < n && read() != -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a character can be read without touching the wrapped
     * reader (buffered characters are pending) or the wrapped reader itself
     * reports being ready.
     *
     * @return true if buffered characters are pending or the wrapped reader
     *         is ready.
     * @throws IOException if the wrapped reader fails.
     */
    public boolean ready() throws IOException {
        boolean buffered = _state == STATE_DRAIN_BUFFER && _bufferPos < _buffer.length();
        return buffered || in.ready();
    }

    /**
     * Marking is not supported.
     *
     * @return always false.
     */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @param readAheadLimit ignored.
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if closing the wrapped reader fails.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Replaces, in a string, every named HTML Latin-1 entity reference by the
     * equivalent numeric character reference. References that are not in the
     * table are left unchanged.
     *
     * @param s text to process, must not be null.
     * @return the processed text; the very same instance when it contains no
     *         ampersand.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        // One matcher walks the whole string; 'pos' is the end of the text
        // already copied to the output.
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            sb.append(s.substring(pos, m.start()));
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}