1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.sun.syndication.io.impl;
18
19 import java.util.ArrayList;
20 import java.util.Iterator;
21 import java.util.List;
22
23 import org.jdom.Document;
24 import org.jdom.Element;
25 import org.jdom.Namespace;
26 import org.jdom.output.XMLOutputter;
27
28 import com.sun.syndication.feed.WireFeed;
29 import com.sun.syndication.feed.atom.Category;
30 import com.sun.syndication.feed.atom.Content;
31 import com.sun.syndication.feed.atom.Entry;
32 import com.sun.syndication.feed.atom.Feed;
33 import com.sun.syndication.feed.atom.Generator;
34 import com.sun.syndication.feed.atom.Link;
35 import com.sun.syndication.feed.atom.Person;
36 import com.sun.syndication.io.FeedException;
37 import java.net.MalformedURLException;
38 import java.net.URL;
39 import java.util.regex.Pattern;
40 import org.jdom.Attribute;
41 import org.jdom.Parent;
42
43 /***
44 * Parser for Atom 1.0
45 * @author Dave Johnson
46 */
47 public class Atom10Parser extends BaseWireFeedParser {
48 private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
49 Namespace ns = Namespace.getNamespace(ATOM_10_URI);
50
51 public Atom10Parser() {
52 this("atom_1.0");
53 }
54
55 protected Atom10Parser(String type) {
56 super(type);
57 }
58
59 protected Namespace getAtomNamespace() {
60 return ns;
61 }
62
63 public boolean isMyType(Document document) {
64 Element rssRoot = document.getRootElement();
65 Namespace defaultNS = rssRoot.getNamespace();
66 return (defaultNS!=null) && defaultNS.equals(getAtomNamespace());
67 }
68
69 public WireFeed parse(Document document, boolean validate)
70 throws IllegalArgumentException,FeedException {
71 if (validate) {
72 validateFeed(document);
73 }
74 Element rssRoot = document.getRootElement();
75 return parseFeed(rssRoot);
76 }
77
78 protected void validateFeed(Document document) throws FeedException {
79
80
81
82
83
84
85 }
86
87 protected WireFeed parseFeed(Element eFeed) throws FeedException {
88
89 com.sun.syndication.feed.atom.Feed feed =
90 new com.sun.syndication.feed.atom.Feed(getType());
91
92 String baseURI = null;
93 try {
94 baseURI = findBaseURI(eFeed);
95 } catch (Exception e) {
96 throw new FeedException("ERROR while finding base URI of feed", e);
97 }
98
99 String xmlBase = eFeed.getAttributeValue("base", Namespace.XML_NAMESPACE);
100 if (xmlBase != null) {
101 feed.setXmlBase(xmlBase);
102 }
103
104 Element e = eFeed.getChild("title",getAtomNamespace());
105 if (e!=null) {
106 Content c = new Content();
107 c.setValue(parseTextConstructToString(e));
108 c.setType(e.getAttributeValue("type"));
109 feed.setTitleEx(c);
110 }
111
112 List eList = eFeed.getChildren("link",getAtomNamespace());
113 feed.setAlternateLinks(parseAlternateLinks(feed, null, baseURI, eList));
114 feed.setOtherLinks(parseOtherLinks(feed, null, baseURI, eList));
115
116 List cList = eFeed.getChildren("category",getAtomNamespace());
117 feed.setCategories(parseCategories(baseURI, cList));
118
119 eList = eFeed.getChildren("author", getAtomNamespace());
120 if (eList.size()>0) {
121 feed.setAuthors(parsePersons(baseURI, eList));
122 }
123
124 eList = eFeed.getChildren("contributor",getAtomNamespace());
125 if (eList.size()>0) {
126 feed.setContributors(parsePersons(baseURI, eList));
127 }
128
129 e = eFeed.getChild("subtitle",getAtomNamespace());
130 if (e!=null) {
131 Content subtitle = new Content();
132 subtitle.setValue(parseTextConstructToString(e));
133 subtitle.setType(e.getAttributeValue("type"));
134 feed.setSubtitle(subtitle);
135 }
136
137 e = eFeed.getChild("id",getAtomNamespace());
138 if (e!=null) {
139 feed.setId(e.getText());
140 }
141
142 e = eFeed.getChild("generator",getAtomNamespace());
143 if (e!=null) {
144 Generator gen = new Generator();
145 gen.setValue(e.getText());
146 String att = e.getAttributeValue("uri");
147 if (att!=null) {
148 gen.setUrl(att);
149 }
150 att = e.getAttributeValue("version");
151 if (att!=null) {
152 gen.setVersion(att);
153 }
154 feed.setGenerator(gen);
155 }
156
157 e = eFeed.getChild("rights",getAtomNamespace());
158 if (e!=null) {
159 feed.setRights(parseTextConstructToString(e));
160 }
161
162 e = eFeed.getChild("icon",getAtomNamespace());
163 if (e!=null) {
164 feed.setIcon(e.getText());
165 }
166
167 e = eFeed.getChild("logo",getAtomNamespace());
168 if (e!=null) {
169 feed.setLogo(e.getText());
170 }
171
172 e = eFeed.getChild("updated",getAtomNamespace());
173 if (e!=null) {
174 feed.setUpdated(DateParser.parseDate(e.getText()));
175 }
176
177 feed.setModules(parseFeedModules(eFeed));
178
179 eList = eFeed.getChildren("entry",getAtomNamespace());
180 if (eList.size()>0) {
181 feed.setEntries(parseEntries(feed, baseURI, eList));
182 }
183
184 List foreignMarkup =
185 extractForeignMarkup(eFeed, feed, getAtomNamespace());
186 if (foreignMarkup.size() > 0) {
187 feed.setForeignMarkup(foreignMarkup);
188 }
189 return feed;
190 }
191
192 private Link parseLink(Feed feed , Entry entry, String baseURI, Element eLink) {
193 Link link = new Link();
194 String att = eLink.getAttributeValue("rel");
195 if (att!=null) {
196 link.setRel(att);
197 }
198 att = eLink.getAttributeValue("type");
199 if (att!=null) {
200 link.setType(att);
201 }
202 att = eLink.getAttributeValue("href");
203 if (att!=null) {
204 if (isRelativeURI(att)) {
205 link.setHref(resolveURI(baseURI, eLink, att));
206 } else {
207 link.setHref(att);
208 }
209 }
210 att = eLink.getAttributeValue("title");
211 if (att!=null) {
212 link.setTitle(att);
213 }
214 att = eLink.getAttributeValue("hreflang");
215 if (att!=null) {
216 link.setHreflang(att);
217 }
218 att = eLink.getAttributeValue("length");
219 if (att!=null) {
220 link.setLength(Long.parseLong(att));
221 }
222 return link;
223 }
224
225
226 private List parseAlternateLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
227 List links = new ArrayList();
228 for (int i=0;i<eLinks.size();i++) {
229 Element eLink = (Element) eLinks.get(i);
230 Link link = parseLink(feed, entry, baseURI, eLink);
231 if (link.getRel() == null
232 || "".equals(link.getRel().trim())
233 || "alternate".equals(link.getRel())) {
234 links.add(link);
235 }
236 }
237 return (links.size()>0) ? links : null;
238 }
239
240 private List parseOtherLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
241 List links = new ArrayList();
242 for (int i=0;i<eLinks.size();i++) {
243 Element eLink = (Element) eLinks.get(i);
244 Link link = parseLink(feed, entry, baseURI, eLink);
245 if (!"alternate".equals(link.getRel())) {
246 links.add(link);
247 }
248 }
249 return (links.size()>0) ? links : null;
250 }
251
252 private Person parsePerson(String baseURI, Element ePerson) {
253 Person person = new Person();
254 Element e = ePerson.getChild("name",getAtomNamespace());
255 if (e!=null) {
256 person.setName(e.getText());
257 }
258 e = ePerson.getChild("uri",getAtomNamespace());
259 if (e!=null) {
260 person.setUri(resolveURI(baseURI, ePerson, e.getText()));
261 }
262 e = ePerson.getChild("email",getAtomNamespace());
263 if (e!=null) {
264 person.setEmail(e.getText());
265 }
266 return person;
267 }
268
269
270 private List parsePersons(String baseURI, List ePersons) {
271 List persons = new ArrayList();
272 for (int i=0;i<ePersons.size();i++) {
273 persons.add(parsePerson(baseURI, (Element)ePersons.get(i)));
274 }
275 return (persons.size()>0) ? persons : null;
276 }
277
278 private Content parseContent(Element e) {
279 String value = parseTextConstructToString(e);
280 String src = e.getAttributeValue("src");
281 String type = e.getAttributeValue("type");
282 Content content = new Content();
283 content.setSrc(src);
284 content.setType(type);
285 content.setValue(value);
286 return content;
287 }
288
289 private String parseTextConstructToString(Element e) {
290 String value = null;
291 String type = e.getAttributeValue("type");
292 type = (type!=null) ? type : Content.TEXT;
293 if (type.equals(Content.XHTML)) {
294
295 XMLOutputter outputter = new XMLOutputter();
296 List eContent = e.getContent();
297 Iterator i = eContent.iterator();
298 while (i.hasNext()) {
299 org.jdom.Content c = (org.jdom.Content) i.next();
300 if (c instanceof Element) {
301 Element eC = (Element) c;
302 if (eC.getNamespace().equals(getAtomNamespace())) {
303 ((Element)c).setNamespace(Namespace.NO_NAMESPACE);
304 }
305 }
306 }
307 value = outputter.outputString(eContent);
308 } else {
309
310 value = e.getText();
311 }
312 return value;
313 }
314
315
316 protected List parseEntries(Feed feed, String baseURI, List eEntries) {
317 List entries = new ArrayList();
318 for (int i=0;i<eEntries.size();i++) {
319 entries.add(parseEntry(feed, (Element)eEntries.get(i), baseURI));
320 }
321 return (entries.size()>0) ? entries : null;
322 }
323
324 protected Entry parseEntry(Feed feed, Element eEntry, String baseURI) {
325 Entry entry = new Entry();
326
327 String xmlBase = eEntry.getAttributeValue("base", Namespace.XML_NAMESPACE);
328 if (xmlBase != null) {
329 entry.setXmlBase(xmlBase);
330 }
331
332 Element e = eEntry.getChild("title",getAtomNamespace());
333 if (e!=null) {
334 Content c = new Content();
335 c.setValue(parseTextConstructToString(e));
336 c.setType(e.getAttributeValue("type"));
337 entry.setTitleEx(c);
338 }
339
340 List eList = eEntry.getChildren("link",getAtomNamespace());
341 entry.setAlternateLinks(parseAlternateLinks(feed, entry, baseURI, eList));
342 entry.setOtherLinks(parseOtherLinks(feed, entry, baseURI, eList));
343
344 eList = eEntry.getChildren("author", getAtomNamespace());
345 if (eList.size()>0) {
346 entry.setAuthors(parsePersons(baseURI, eList));
347 }
348
349 eList = eEntry.getChildren("contributor",getAtomNamespace());
350 if (eList.size()>0) {
351 entry.setContributors(parsePersons(baseURI, eList));
352 }
353
354 e = eEntry.getChild("id",getAtomNamespace());
355 if (e!=null) {
356 entry.setId(e.getText());
357 }
358
359 e = eEntry.getChild("updated",getAtomNamespace());
360 if (e!=null) {
361 entry.setUpdated(DateParser.parseW3CDateTime(e.getText()));
362 }
363
364 e = eEntry.getChild("published",getAtomNamespace());
365 if (e!=null) {
366 entry.setPublished(DateParser.parseW3CDateTime(e.getText()));
367 }
368
369 e = eEntry.getChild("summary",getAtomNamespace());
370 if (e!=null) {
371 entry.setSummary(parseContent(e));
372 }
373
374 e = eEntry.getChild("content",getAtomNamespace());
375 if (e!=null) {
376 List contents = new ArrayList();
377 contents.add(parseContent(e));
378 entry.setContents(contents);
379 }
380
381 e = eEntry.getChild("rights",getAtomNamespace());
382 if (e!=null) {
383 entry.setRights(e.getText());
384 }
385
386 List cList = eEntry.getChildren("category",getAtomNamespace());
387 entry.setCategories(parseCategories(baseURI, cList));
388
389
390
391 entry.setModules(parseItemModules(eEntry));
392
393 List foreignMarkup =
394 extractForeignMarkup(eEntry, entry, getAtomNamespace());
395 if (foreignMarkup.size() > 0) {
396 entry.setForeignMarkup(foreignMarkup);
397 }
398 return entry;
399 }
400
401 private List parseCategories(String baseURI, List eCategories) {
402 List cats = new ArrayList();
403 for (int i=0;i<eCategories.size();i++) {
404 Element eCategory = (Element) eCategories.get(i);
405 cats.add(parseCategory(baseURI, eCategory));
406 }
407 return (cats.size()>0) ? cats : null;
408 }
409
410 private Category parseCategory(String baseURI, Element eCategory) {
411 Category category = new Category();
412 String att = eCategory.getAttributeValue("term");
413 if (att!=null) {
414 category.setTerm(att);
415 }
416 att = eCategory.getAttributeValue("scheme");
417 if (att!=null) {
418 category.setScheme(resolveURI(baseURI, eCategory, att));
419 }
420 att = eCategory.getAttributeValue("label");
421 if (att!=null) {
422 category.setLabel(att);
423 }
424 return category;
425
426 }
427
428
429
430
431
432
433 static Pattern absoluteURIPattern = Pattern.compile("^[a-z0-9]*:.*$");
434
435 private boolean isAbsoluteURI(String uri) {
436 return absoluteURIPattern.matcher(uri).find();
437 }
438
439 private boolean isRelativeURI(String uri) {
440 return !isAbsoluteURI(uri);
441 }
442
443 /***
444 * }
445 * Resolve URI based considering xml:base and baseURI.
446 * @param baseURI Base URI of feed
447 * @param parent Parent from which to consider xml:base
448 * @param url URL to be resolved
449 */
450 private String resolveURI(String baseURI, Parent parent, String url) {
451 if (isRelativeURI(url)) {
452 url = (!".".equals(url) && !"./".equals(url)) ? url : "";
453
454
455 if (parent != null && parent instanceof Element) {
456
457
458 String xmlbase = ((Element)parent).getAttributeValue(
459 "base", Namespace.XML_NAMESPACE);
460 if (xmlbase != null && xmlbase.trim().length() > 0) {
461 if (isAbsoluteURI(xmlbase)) {
462
463 if (url.startsWith("/")) {
464
465 int slashslash = xmlbase.indexOf("//");
466 int nextslash = xmlbase.indexOf("/", slashslash + 2);
467 if (nextslash != -1) xmlbase = xmlbase.substring(0, nextslash);
468 return formURI(xmlbase, url);
469 }
470 if (!xmlbase.endsWith("/")) {
471
472 xmlbase = xmlbase.substring(0, xmlbase.lastIndexOf("/"));
473 }
474 return formURI(xmlbase, url);
475 } else {
476
477 return resolveURI(baseURI, parent.getParent(),
478 stripTrailingSlash(xmlbase) + "/"+ stripStartingSlash(url));
479 }
480 }
481
482 return resolveURI(baseURI, parent.getParent(), url);
483
484
485 } else if (parent == null || parent instanceof Document) {
486 return formURI(baseURI, url);
487 }
488 }
489 return url;
490 }
491
492 /***
493 * Find base URI of feed considering relative URIs.
494 * @param root Root element of feed.
495 */
496 private String findBaseURI(Element root) throws MalformedURLException {
497 String ret = findAtomLink(root, "alternate");
498 if (ret != null && isRelativeURI(ret)) {
499 String self = findAtomLink(root, "self");
500 if (self != null) {
501 self = resolveURI(null, root, self);
502 self = self.substring(0, self.lastIndexOf("/"));
503 ret = resolveURI(self, root, ret);
504 }
505 }
506 return ret;
507 }
508
509 /***
510 * Return URL string of Atom link element under parent element.
511 * Link with no rel attribute is considered to be rel="alternate"
512 * @param parent Consider only children of this parent element
513 * @param rel Consider only links with this relationship
514 */
515 private String findAtomLink(Element parent, String rel) {
516 String ret = null;
517 List linksList = parent.getChildren("link", ns);
518 if (linksList != null) {
519 for (Iterator links = linksList.iterator(); links.hasNext(); ) {
520 Element link = (Element)links.next();
521 Attribute relAtt = link.getAttribute("rel");
522 Attribute hrefAtt = link.getAttribute("href");
523 if ( (relAtt == null && "alternate".equals(rel))
524 || (relAtt != null && relAtt.getValue().equals(rel))) {
525 ret = hrefAtt.getValue();
526 break;
527 }
528 }
529 }
530 return ret;
531 }
532
533 /***
534 * Form URI by combining base with append portion and giving
535 * special consideration to append portions that begin with ".."
536 * @param base Base of URI, may end with trailing slash
537 * @param append String to append, may begin with slash or ".."
538 */
539 private static String formURI(String base, String append) {
540 base = stripTrailingSlash(base);
541 append = stripStartingSlash(append);
542 if (append.startsWith("..")) {
543 String ret = null;
544 String[] parts = append.split("/");
545 for (int i=0; i<parts.length; i++) {
546 if ("..".equals(parts[i])) {
547 int last = base.lastIndexOf("/");
548 if (last != -1) {
549 base = base.substring(0, last);
550 append = append.substring(3, append.length());
551 }
552 else break;
553 }
554 }
555 }
556 return base + "/" + append;
557 }
558
559 /***
560 * Strip starting slash from beginning of string.
561 */
562 private static String stripStartingSlash(String s) {
563 if (s != null && s.startsWith("/")) {
564 s = s.substring(1, s.length());
565 }
566 return s;
567 }
568
569 /***
570 * Strip trailing slash from end of string.
571 */
572 private static String stripTrailingSlash(String s) {
573 if (s != null && s.endsWith("/")) {
574 s = s.substring(0, s.length() - 1);
575 }
576 return s;
577 }
578 }