1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io.impl;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.util.HashMap;
22 import java.util.Map;
23 import java.util.regex.Pattern;
24 import java.util.regex.Matcher;
25
/**
 * Character-stream filter that repairs common defects of real-world feeds
 * before the text reaches an XML parser.
 * <p>
 * The reader performs two independent fixes on the wrapped stream:
 * <ul>
 *   <li>On the first read it discards leading XML whitespace and any leading
 *       <code>&lt;!-- ... --&gt;</code> comments, so that the first character
 *       delivered is the first character of real content.</li>
 *   <li>While streaming it rewrites HTML 4 named character entities
 *       (<code>&amp;nbsp;</code>, <code>&amp;copy;</code>, ...) into numeric
 *       character references (<code>&amp;#160;</code>, <code>&amp;#169;</code>, ...)
 *       and escapes bare ampersands as <code>&amp;amp;</code>.</li>
 * </ul>
 * Instances keep parsing state in plain fields and do no locking of their own.
 *
 * @author Alejandro Abdelnur
 */
public class XmlFixerReader extends Reader {

    // ---- states of the read() state machine (stored in _state) ----

    /** Characters are passed straight through from the wrapped reader. */
    private static final int STATE_PASS_THROUGH = 0;
    /** An '&amp;' was seen; the candidate entity is being accumulated in _buffer. */
    private static final int STATE_IN_ENTITY = 1;
    /** A terminating ';' was seen; _buffer holds a complete entity to translate. */
    private static final int STATE_ENTITY_COMPLETE = 2;
    /** The content of _buffer is being replayed to the caller. */
    private static final int STATE_DRAIN_BUFFER = 3;

    // ---- states local to trimStream() ----

    private static final int TRIM_START = 0;              // between tokens, skipping whitespace
    private static final int TRIM_LT = 1;                 // seen "<"
    private static final int TRIM_LT_BANG = 2;            // seen "<!"
    private static final int TRIM_LT_BANG_DASH = 3;       // seen "<!-"
    private static final int TRIM_COMMENT = 4;            // inside a comment body
    private static final int TRIM_COMMENT_DASH = 5;       // inside a comment, seen "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6;  // inside a comment, seen "--"

    /** The wrapped reader all characters are pulled from. */
    protected Reader in;

    /** Whether the leading whitespace/comments have already been trimmed. */
    private boolean trimmed;
    /** Look-ahead buffer: either a pending entity or characters waiting to be replayed. */
    private StringBuffer _buffer;
    /** Index of the next character of _buffer to hand out while draining. */
    private int _bufferPos;
    /** Current state of the read() state machine (one of the STATE_ constants). */
    private int _state = STATE_PASS_THROUGH;
    /** Set when a bare '&amp;' was terminated by another '&amp;' that still has to be processed. */
    private boolean _pendingAmp;

    /**
     * Creates a fixer around the given reader.
     *
     * @param in reader to take the raw characters from; it is also used as the lock object.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        _buffer = new StringBuffer();
        _state = STATE_PASS_THROUGH;
    }

    /**
     * Consumes leading whitespace and comments from the wrapped reader.
     * <p>
     * Any non-comment characters that had to be read to make the decision are
     * left in _buffer and _state is set so that read() replays them first.
     *
     * @return false if the stream ended with nothing but whitespace and
     *         complete comments, true otherwise.
     * @throws IOException if the wrapped reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_START;
        while (true) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_START) {
                    return false;
                }
                // Stream ended in the middle of a tag/comment: replay what was buffered.
                _state = STATE_DRAIN_BUFFER;
                return true;
            }
            switch (state) {
                case TRIM_START:
                    // Skip all XML whitespace, not only blank and line feed.
                    if (c == ' ' || c == '\n' || c == '\r' || c == '\t') {
                        break;
                    }
                    _buffer.setLength(0);
                    _bufferPos = 0;
                    _buffer.append((char) c);
                    if (c == '<') {
                        state = TRIM_LT;
                    }
                    else {
                        // A leading '&' must go through entity processing like any other.
                        _state = (c == '&') ? STATE_IN_ENTITY : STATE_DRAIN_BUFFER;
                        return true;
                    }
                    break;
                case TRIM_LT:
                    _buffer.append((char) c);
                    if (c == '!') {
                        state = TRIM_LT_BANG;
                    }
                    else {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    break;
                case TRIM_LT_BANG:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_LT_BANG_DASH;
                    }
                    else {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    break;
                case TRIM_LT_BANG_DASH:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT;
                    }
                    else {
                        _state = STATE_DRAIN_BUFFER;
                        return true;
                    }
                    break;
                case TRIM_COMMENT:
                    _buffer.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    _buffer.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // Comment complete: drop it and look for the next token.
                        _buffer.setLength(0);
                        state = TRIM_START;
                    }
                    else {
                        _buffer.append((char) c);
                        // A further '-' still leaves "--" as the last two characters,
                        // so "--->" is recognised as the end of the comment.
                        if (c != '-') {
                            state = TRIM_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /** Resets _buffer to a single '&amp;' and switches to entity accumulation. */
    private void startEntity() {
        _buffer.setLength(0);
        _bufferPos = 0;
        _buffer.append('&');
        _state = STATE_IN_ENTITY;
    }

    /** Tells whether c may appear between the '&amp;' and the ';' of an entity. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9') || c == '#';
    }

    /**
     * Reads the next fixed character.
     *
     * @return the character, or -1 at end of stream.
     * @throws IOException if the wrapped reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) {
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        while (true) {
            switch (_state) {
                case STATE_PASS_THROUGH: {
                    int c = in.read();
                    if (c != '&') {
                        return c; // ordinary character or -1
                    }
                    startEntity();
                    break;
                }
                case STATE_IN_ENTITY: {
                    int c = in.read();
                    if (c == ';') {
                        _buffer.append(';');
                        _state = STATE_ENTITY_COMPLETE;
                    }
                    else if (isEntityChar(c)) {
                        _buffer.append((char) c);
                    }
                    else {
                        // Not an entity after all: escape the ampersand that started it.
                        _buffer.insert(1, "amp;");
                        if (c == '&') {
                            // The terminator starts a new candidate entity; handle it
                            // once the current buffer has been replayed.
                            _pendingAmp = true;
                        }
                        else if (c > -1) {
                            _buffer.append((char) c);
                        }
                        _state = STATE_DRAIN_BUFFER;
                    }
                    break;
                }
                case STATE_ENTITY_COMPLETE: {
                    String codedEntity = (String) CODED_ENTITIES.get(_buffer.toString());
                    if (codedEntity != null) {
                        _buffer.setLength(0);
                        _buffer.append(codedEntity);
                    }
                    // Unknown entities (including numeric references) are replayed untouched.
                    _state = STATE_DRAIN_BUFFER;
                    break;
                }
                case STATE_DRAIN_BUFFER:
                    if (_bufferPos < _buffer.length()) {
                        return _buffer.charAt(_bufferPos++);
                    }
                    if (_pendingAmp) {
                        _pendingAmp = false;
                        startEntity();
                    }
                    else {
                        _state = STATE_PASS_THROUGH;
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads up to len fixed characters into buffer, starting at offset.
     * Keeps reading until len characters were stored or the stream ends.
     *
     * @param buffer destination array.
     * @param offset index of the first element to write.
     * @param len maximum number of characters to read.
     * @return number of characters stored, 0 if len is 0, or -1 if the stream
     *         was already at its end.
     * @throws IndexOutOfBoundsException if offset/len do not describe a range of buffer.
     * @throws IOException if the wrapped reader fails.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        do {
            buffer[offset + (charsRead++)] = (char) c;
        } while (charsRead < len && (c = read()) > -1);
        return charsRead;
    }

    /**
     * Skips up to n fixed characters.
     *
     * @param n number of characters to skip.
     * @return number of characters actually skipped; less than n only if the
     *         end of the stream was reached.
     * @throws IllegalArgumentException if n is negative.
     * @throws IOException if the wrapped reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        // Count only characters that were really consumed, not the EOF marker.
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether characters are available: either buffered characters are
     * still waiting to be replayed or the wrapped reader reports ready.
     *
     * @return true if buffered output remains or the wrapped reader is ready.
     * @throws IOException if the wrapped reader fails.
     */
    public boolean ready() throws IOException {
        boolean buffered = _state == STATE_DRAIN_BUFFER && _bufferPos < _buffer.length();
        return buffered || in.ready();
    }

    /** @return always false, marking is not supported. */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the wrapped reader.
     *
     * @throws IOException if the wrapped reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * HTML 4 character entities as alternating (name, decimal code point) pairs.
     * See "Character entity references in HTML 4",
     * http://www.w3.org/TR/REC-html40/sgml/entities.html
     */
    private static final String[] ENTITY_TABLE = {
        // HTMLlat1 "-//W3C//ENTITIES Latin 1//EN//HTML"
        "nbsp", "160",     "iexcl", "161",    "cent", "162",     "pound", "163",
        "curren", "164",   "yen", "165",      "brvbar", "166",   "sect", "167",
        "uml", "168",      "copy", "169",     "ordf", "170",     "laquo", "171",
        "not", "172",      "shy", "173",      "reg", "174",      "macr", "175",
        "deg", "176",      "plusmn", "177",   "sup2", "178",     "sup3", "179",
        "acute", "180",    "micro", "181",    "para", "182",     "middot", "183",
        "cedil", "184",    "sup1", "185",     "ordm", "186",     "raquo", "187",
        "frac14", "188",   "frac12", "189",   "frac34", "190",   "iquest", "191",
        "Agrave", "192",   "Aacute", "193",   "Acirc", "194",    "Atilde", "195",
        "Auml", "196",     "Aring", "197",    "AElig", "198",    "Ccedil", "199",
        "Egrave", "200",   "Eacute", "201",   "Ecirc", "202",    "Euml", "203",
        "Igrave", "204",   "Iacute", "205",   "Icirc", "206",    "Iuml", "207",
        "ETH", "208",      "Ntilde", "209",   "Ograve", "210",   "Oacute", "211",
        "Ocirc", "212",    "Otilde", "213",   "Ouml", "214",     "times", "215",
        "Oslash", "216",   "Ugrave", "217",   "Uacute", "218",   "Ucirc", "219",
        "Uuml", "220",     "Yacute", "221",   "THORN", "222",    "szlig", "223",
        "agrave", "224",   "aacute", "225",   "acirc", "226",    "atilde", "227",
        "auml", "228",     "aring", "229",    "aelig", "230",    "ccedil", "231",
        "egrave", "232",   "eacute", "233",   "ecirc", "234",    "euml", "235",
        "igrave", "236",   "iacute", "237",   "icirc", "238",    "iuml", "239",
        "eth", "240",      "ntilde", "241",   "ograve", "242",   "oacute", "243",
        "ocirc", "244",    "otilde", "245",   "ouml", "246",     "divide", "247",
        "oslash", "248",   "ugrave", "249",   "uacute", "250",   "ucirc", "251",
        "uuml", "252",     "yacute", "253",   "thorn", "254",    "yuml", "255",

        // HTMLsymbol "-//W3C//ENTITIES Symbols//EN//HTML"
        "fnof", "402",
        "Alpha", "913",    "Beta", "914",     "Gamma", "915",    "Delta", "916",
        "Epsilon", "917",  "Zeta", "918",     "Eta", "919",      "Theta", "920",
        "Iota", "921",     "Kappa", "922",    "Lambda", "923",   "Mu", "924",
        "Nu", "925",       "Xi", "926",       "Omicron", "927",  "Pi", "928",
        "Rho", "929",      "Sigma", "931",    "Tau", "932",      "Upsilon", "933",
        "Phi", "934",      "Chi", "935",      "Psi", "936",      "Omega", "937",
        "alpha", "945",    "beta", "946",     "gamma", "947",    "delta", "948",
        "epsilon", "949",  "zeta", "950",     "eta", "951",      "theta", "952",
        "iota", "953",     "kappa", "954",    "lambda", "955",   "mu", "956",
        "nu", "957",       "xi", "958",       "omicron", "959",  "pi", "960",
        "rho", "961",      "sigmaf", "962",   "sigma", "963",    "tau", "964",
        "upsilon", "965",  "phi", "966",      "chi", "967",      "psi", "968",
        "omega", "969",    "thetasym", "977", "upsih", "978",    "piv", "982",
        "bull", "8226",    "hellip", "8230",  "prime", "8242",   "Prime", "8243",
        "oline", "8254",   "frasl", "8260",   "weierp", "8472",  "image", "8465",
        "real", "8476",    "trade", "8482",   "alefsym", "8501",
        "larr", "8592",    "uarr", "8593",    "rarr", "8594",    "darr", "8595",
        "harr", "8596",    "crarr", "8629",
        "lArr", "8656",    "uArr", "8657",    "rArr", "8658",    "dArr", "8659",
        "hArr", "8660",
        "forall", "8704",  "part", "8706",    "exist", "8707",   "empty", "8709",
        "nabla", "8711",   "isin", "8712",    "notin", "8713",   "ni", "8715",
        "prod", "8719",    "sum", "8721",     "minus", "8722",   "lowast", "8727",
        "radic", "8730",   "prop", "8733",    "infin", "8734",   "ang", "8736",
        "and", "8743",     "or", "8744",      "cap", "8745",     "cup", "8746",
        "int", "8747",     "there4", "8756",  "sim", "8764",     "cong", "8773",
        "asymp", "8776",   "ne", "8800",      "equiv", "8801",   "le", "8804",
        "ge", "8805",      "sub", "8834",     "sup", "8835",     "nsub", "8836",
        "sube", "8838",    "supe", "8839",    "oplus", "8853",   "otimes", "8855",
        "perp", "8869",    "sdot", "8901",
        "lceil", "8968",   "rceil", "8969",   "lfloor", "8970",  "rfloor", "8971",
        "lang", "9001",    "rang", "9002",    "loz", "9674",
        "spades", "9824",  "clubs", "9827",   "hearts", "9829",  "diams", "9830",

        // HTMLspecial "-//W3C//ENTITIES Special//EN//HTML"
        "quot", "34",      "amp", "38",       "lt", "60",        "gt", "62",
        "OElig", "338",    "oelig", "339",    "Scaron", "352",   "scaron", "353",
        "Yuml", "376",     "circ", "710",     "tilde", "732",
        "ensp", "8194",    "emsp", "8195",    "thinsp", "8201",
        "zwnj", "8204",    "zwj", "8205",     "lrm", "8206",     "rlm", "8207",
        "ndash", "8211",   "mdash", "8212",
        "lsquo", "8216",   "rsquo", "8217",   "sbquo", "8218",
        "ldquo", "8220",   "rdquo", "8221",   "bdquo", "8222",
        "dagger", "8224",  "Dagger", "8225",  "permil", "8240",
        "lsaquo", "8249",  "rsaquo", "8250",  "euro", "8364"
    };

    /** Maps a named entity such as "&amp;nbsp;" to its numeric form such as "&amp;#160;". */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        for (int i = 0; i < ENTITY_TABLE.length; i += 2) {
            CODED_ENTITIES.put("&" + ENTITY_TABLE[i] + ";", "&#" + ENTITY_TABLE[i + 1] + ";");
        }
    }

    //
    // It shouldn't be here but well, just reusing the CODED_ENTITIES Map :)
    //

    /** Matches a named entity reference; names start with a letter and may contain digits. */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile( "&[A-Za-z][A-Za-z0-9]*;" );

    /**
     * Replaces every known HTML 4 named entity in s by its numeric character
     * reference. Unknown entities and all other text are copied unchanged.
     *
     * @param s text to process, must not be null.
     * @return s itself if it contains no '&amp;', otherwise the converted text.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&')==-1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            // Text between the previous match and this one is copied verbatim.
            sb.append(s.substring(pos, m.start()));
            String entity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(entity);
            sb.append(codedEntity != null ? codedEntity : entity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}