View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package com.sun.syndication.io.impl;
18  
19  import java.util.ArrayList;
20  import java.util.Iterator;
21  import java.util.List;
22  
23  import org.jdom.Document; 
24  import org.jdom.Element;
25  import org.jdom.Namespace;
26  import org.jdom.output.XMLOutputter; 
27  
28  import com.sun.syndication.feed.WireFeed;
29  import com.sun.syndication.feed.atom.Category;
30  import com.sun.syndication.feed.atom.Content;
31  import com.sun.syndication.feed.atom.Entry;
32  import com.sun.syndication.feed.atom.Feed;
33  import com.sun.syndication.feed.atom.Generator;
34  import com.sun.syndication.feed.atom.Link;
35  import com.sun.syndication.feed.atom.Person;
36  import com.sun.syndication.io.FeedException;
37  import com.sun.syndication.io.WireFeedInput;
38  import com.sun.syndication.io.WireFeedOutput;
39  import java.io.IOException;
40  import java.io.Reader;
41  import java.net.MalformedURLException;
42  import java.util.regex.Pattern;
43  import org.jdom.Attribute;
44  import org.jdom.JDOMException;
45  import org.jdom.Parent;
46  import org.jdom.input.SAXBuilder;
47  
48  /***
49   * Parser for Atom 1.0
50   * @author Dave Johnson
51   */
52  public class Atom10Parser extends BaseWireFeedParser {
53      private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
54      private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
55  
56      private static boolean resolveURIs = false;
57  
58      public static void setResolveURIs(boolean resolveURIs) {
59          Atom10Parser.resolveURIs = resolveURIs;
60      }
61  
62      public static boolean getResolveURIs() {
63          return resolveURIs;
64      }
65  
66      public Atom10Parser() {
67          this("atom_1.0");
68      }
69      
70      protected Atom10Parser(String type) {
71          super(type, ATOM_10_NS);
72      }
73      
74      protected Namespace getAtomNamespace() {
75          return ATOM_10_NS;
76      }
77      
78      public boolean isMyType(Document document) {
79          Element rssRoot = document.getRootElement();
80          Namespace defaultNS = rssRoot.getNamespace();
81          return (defaultNS!=null) && defaultNS.equals(getAtomNamespace());
82      }
83      
84      public WireFeed parse(Document document, boolean validate)
85      throws IllegalArgumentException,FeedException {
86          if (validate) {
87              validateFeed(document);
88          }
89          Element rssRoot = document.getRootElement();
90          return parseFeed(rssRoot);
91      }
92      
93      protected void validateFeed(Document document) throws FeedException {
94          // TBD
95          // here we have to validate the Feed against a schema or whatever
96          // not sure how to do it
97          // one posibility would be to produce an ouput and attempt to parse it again
98          // with validation turned on.
99          // otherwise will have to check the document elements by hand.
100     }
101     
102     protected WireFeed parseFeed(Element eFeed) throws FeedException {
103         
104         String baseURI = null;
105         try {
106             baseURI = findBaseURI(eFeed);
107         } catch (Exception e) {
108             throw new FeedException("ERROR while finding base URI of feed", e);
109         }
110         
111         Feed feed = parseFeedMetadata(baseURI, eFeed);
112 
113         String xmlBase = eFeed.getAttributeValue("base", Namespace.XML_NAMESPACE);
114         if (xmlBase != null) {
115             feed.setXmlBase(xmlBase);
116         }
117         
118         feed.setModules(parseFeedModules(eFeed));
119 
120         List eList = eFeed.getChildren("entry",getAtomNamespace());
121         if (eList.size()>0) {
122             feed.setEntries(parseEntries(feed, baseURI, eList));
123         }
124 
125         List foreignMarkup =
126             extractForeignMarkup(eFeed, feed, getAtomNamespace());
127         if (foreignMarkup.size() > 0) {
128             feed.setForeignMarkup(foreignMarkup);
129         }
130         return feed;
131     }
132 
133     private Feed parseFeedMetadata(String baseURI, Element eFeed) {
134         com.sun.syndication.feed.atom.Feed feed =
135             new com.sun.syndication.feed.atom.Feed(getType());
136 
137         Element e = eFeed.getChild("title",getAtomNamespace());
138         if (e!=null) {
139             Content c = new Content();
140             c.setValue(parseTextConstructToString(e));
141             c.setType(getAttributeValue(e, "type"));
142             feed.setTitleEx(c);
143         }
144         
145         List eList = eFeed.getChildren("link",getAtomNamespace());
146         feed.setAlternateLinks(parseAlternateLinks(feed, null, baseURI, eList));
147         feed.setOtherLinks(parseOtherLinks(feed, null, baseURI, eList));
148         
149         List cList = eFeed.getChildren("category",getAtomNamespace());
150         feed.setCategories(parseCategories(baseURI, cList));
151         
152         eList = eFeed.getChildren("author", getAtomNamespace());
153         if (eList.size()>0) {
154             feed.setAuthors(parsePersons(baseURI, eList));
155         }
156         
157         eList = eFeed.getChildren("contributor",getAtomNamespace());
158         if (eList.size()>0) {
159             feed.setContributors(parsePersons(baseURI, eList));
160         }
161         
162         e = eFeed.getChild("subtitle",getAtomNamespace());
163         if (e!=null) {
164             Content subtitle = new Content();
165             subtitle.setValue(parseTextConstructToString(e));
166             subtitle.setType(getAttributeValue(e, "type"));
167             feed.setSubtitle(subtitle);
168         }
169         
170         e = eFeed.getChild("id",getAtomNamespace());
171         if (e!=null) {
172             feed.setId(e.getText());
173         }
174         
175         e = eFeed.getChild("generator",getAtomNamespace());
176         if (e!=null) {
177             Generator gen = new Generator();
178             gen.setValue(e.getText());
179             String att = getAttributeValue(e, "uri");
180             if (att!=null) {
181                 gen.setUrl(att);
182             }
183             att = getAttributeValue(e, "version");
184             if (att!=null) {
185                 gen.setVersion(att);
186             }
187             feed.setGenerator(gen);
188         }
189         
190         e = eFeed.getChild("rights",getAtomNamespace());
191         if (e!=null) {
192             feed.setRights(parseTextConstructToString(e));
193         }
194         
195         e = eFeed.getChild("icon",getAtomNamespace());
196         if (e!=null) {
197             feed.setIcon(e.getText());
198         }
199         
200         e = eFeed.getChild("logo",getAtomNamespace());
201         if (e!=null) {
202             feed.setLogo(e.getText());
203         }
204         
205         e = eFeed.getChild("updated",getAtomNamespace());
206         if (e!=null) {
207             feed.setUpdated(DateParser.parseDate(e.getText()));
208         }
209         
210         return feed;
211     }
212 
213     private Link parseLink(Feed feed , Entry entry, String baseURI, Element eLink) {
214         Link link = new Link();
215         String att = getAttributeValue(eLink, "rel");
216         if (att!=null) {
217             link.setRel(att);
218         }
219         att = getAttributeValue(eLink, "type");
220         if (att!=null) {
221             link.setType(att);
222         }
223         att = getAttributeValue(eLink, "href");
224         if (att!=null) {
225             link.setHref(att);
226             if (isRelativeURI(att)) {
227                 link.setHrefResolved(resolveURI(baseURI, eLink, att));
228             } 
229         }
230         att = getAttributeValue(eLink, "title");
231         if (att!=null) {
232             link.setTitle(att);
233         }
234         att = getAttributeValue(eLink, "hreflang");
235         if (att!=null) {
236             link.setHreflang(att);
237         }
238         att = getAttributeValue(eLink, "length");
239         if (att!=null) {
240             link.setLength(Long.parseLong(att));
241         }
242         return link;
243     }
244     
245     // List(Elements) -> List(Link)
246     private List parseAlternateLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
247         List links = new ArrayList();
248         for (int i=0;i<eLinks.size();i++) {
249             Element eLink = (Element) eLinks.get(i);
250             Link link = parseLink(feed, entry, baseURI, eLink);
251             if (link.getRel() == null
252                     || "".equals(link.getRel().trim())
253                     || "alternate".equals(link.getRel())) {
254                 links.add(link);
255             }
256         }
257         return (links.size()>0) ? links : null;
258     }
259     
260     private List parseOtherLinks(Feed feed, Entry entry, String baseURI, List eLinks) {
261         List links = new ArrayList();
262         for (int i=0;i<eLinks.size();i++) {
263             Element eLink = (Element) eLinks.get(i);
264             Link link = parseLink(feed, entry, baseURI, eLink);
265             if (!"alternate".equals(link.getRel())) {
266                 links.add(link);
267             }
268         }
269         return (links.size()>0) ? links : null;
270     }
271     
272     private Person parsePerson(String baseURI, Element ePerson) {
273         Person person = new Person();
274         Element e = ePerson.getChild("name",getAtomNamespace());
275         if (e!=null) {
276             person.setName(e.getText());
277         }
278         e = ePerson.getChild("uri",getAtomNamespace());
279         if (e!=null) {
280             person.setUri(e.getText());
281             if (isRelativeURI(e.getText())) {
282                person.setUriResolved(resolveURI(baseURI, ePerson, e.getText())); 
283             }
284         }
285         e = ePerson.getChild("email",getAtomNamespace());
286         if (e!=null) {
287             person.setEmail(e.getText());
288         }
289         return person;
290     }
291     
292     // List(Elements) -> List(Persons)
293     private List parsePersons(String baseURI, List ePersons) {
294         List persons = new ArrayList();
295         for (int i=0;i<ePersons.size();i++) {
296             persons.add(parsePerson(baseURI, (Element)ePersons.get(i)));
297         }
298         return (persons.size()>0) ? persons : null;
299     }
300     
301     private Content parseContent(Element e) {
302         String value = parseTextConstructToString(e);
303         String src = getAttributeValue(e, "src");
304         String type = getAttributeValue(e, "type");
305         Content content = new Content();
306         content.setSrc(src);
307         content.setType(type);
308         content.setValue(value);
309         return content;
310     }
311     
312     private String parseTextConstructToString(Element e) {
313         String value = null;
314         String type = getAttributeValue(e, "type");
315         type = (type!=null) ? type : Content.TEXT;
316         if (type.equals(Content.XHTML) || (type.indexOf("/xml")) != -1 || (type.indexOf("+xml")) != -1) {
317             // XHTML content needs special handling
318             XMLOutputter outputter = new XMLOutputter();
319             List eContent = e.getContent();
320             Iterator i = eContent.iterator();
321             while (i.hasNext()) {
322                 org.jdom.Content c = (org.jdom.Content) i.next();
323                 if (c instanceof Element) {
324                     Element eC = (Element) c;
325                     if (eC.getNamespace().equals(getAtomNamespace())) {
326                         ((Element)c).setNamespace(Namespace.NO_NAMESPACE);
327                     }
328                 }
329             }
330             value = outputter.outputString(eContent);
331         } else {
332             // Everything else comes in verbatim
333             value = e.getText();
334         }
335         return value;
336     }
337     
338     // List(Elements) -> List(Entries)
339     protected List parseEntries(Feed feed, String baseURI, List eEntries) {
340         List entries = new ArrayList();
341         for (int i=0;i<eEntries.size();i++) {
342             entries.add(parseEntry(feed, (Element)eEntries.get(i), baseURI));
343         }
344         return (entries.size()>0) ? entries : null;
345     }
346     
347     protected Entry parseEntry(Feed feed, Element eEntry, String baseURI) {
348         Entry entry = new Entry();
349         
350         String xmlBase = eEntry.getAttributeValue("base", Namespace.XML_NAMESPACE);
351         if (xmlBase != null) {
352             entry.setXmlBase(xmlBase);
353         }
354         
355         Element e = eEntry.getChild("title",getAtomNamespace());
356         if (e!=null) {
357             Content c = new Content();
358             c.setValue(parseTextConstructToString(e));
359             c.setType(getAttributeValue(e, "type"));
360             entry.setTitleEx(c);
361         }
362         
363         List eList = eEntry.getChildren("link",getAtomNamespace());
364         entry.setAlternateLinks(parseAlternateLinks(feed, entry, baseURI, eList));
365         entry.setOtherLinks(parseOtherLinks(feed, entry, baseURI, eList));
366         
367         eList = eEntry.getChildren("author", getAtomNamespace());
368         if (eList.size()>0) {
369             entry.setAuthors(parsePersons(baseURI, eList));
370         }
371         
372         eList = eEntry.getChildren("contributor",getAtomNamespace());
373         if (eList.size()>0) {
374             entry.setContributors(parsePersons(baseURI, eList));
375         }
376         
377         e = eEntry.getChild("id",getAtomNamespace());
378         if (e!=null) {
379             entry.setId(e.getText());
380         }
381         
382         e = eEntry.getChild("updated",getAtomNamespace());
383         if (e!=null) {
384             entry.setUpdated(DateParser.parseW3CDateTime(e.getText()));
385         }
386         
387         e = eEntry.getChild("published",getAtomNamespace());
388         if (e!=null) {
389             entry.setPublished(DateParser.parseW3CDateTime(e.getText()));
390         }
391         
392         e = eEntry.getChild("summary",getAtomNamespace());
393         if (e!=null) {
394             entry.setSummary(parseContent(e));
395         }
396         
397         e = eEntry.getChild("content",getAtomNamespace());
398         if (e!=null) {
399             List contents = new ArrayList();
400             contents.add(parseContent(e));
401             entry.setContents(contents);
402         }
403         
404         e = eEntry.getChild("rights",getAtomNamespace());
405         if (e!=null) {
406             entry.setRights(e.getText());
407         }
408         
409         List cList = eEntry.getChildren("category",getAtomNamespace());
410         entry.setCategories(parseCategories(baseURI, cList));
411         
412         // TODO: SHOULD handle Atom entry source element
413         e = eEntry.getChild("source", getAtomNamespace());
414         if (e!=null) {
415             entry.setSource(parseFeedMetadata(baseURI, e));
416         }
417         
418         entry.setModules(parseItemModules(eEntry));
419         
420         List foreignMarkup =
421                 extractForeignMarkup(eEntry, entry, getAtomNamespace());
422         if (foreignMarkup.size() > 0) {
423             entry.setForeignMarkup(foreignMarkup);
424         }
425         return entry;
426     }
427     
428     private List parseCategories(String baseURI, List eCategories) {
429         List cats = new ArrayList();
430         for (int i=0;i<eCategories.size();i++) {
431             Element eCategory = (Element) eCategories.get(i);
432             cats.add(parseCategory(baseURI, eCategory));
433         }
434         return (cats.size()>0) ? cats : null;
435     }
436     
437     private Category parseCategory(String baseURI, Element eCategory) {
438         Category category = new Category();
439         String att = getAttributeValue(eCategory, "term");
440         if (att!=null) {
441             category.setTerm(att);
442         }
443         att = getAttributeValue(eCategory, "scheme");
444         if (att!=null) {
445             category.setScheme(att);
446             if (isRelativeURI(att)) {
447                 category.setSchemeResolved(resolveURI(baseURI, eCategory, att));
448             }
449         }
450         att = getAttributeValue(eCategory, "label");
451         if (att!=null) {
452             category.setLabel(att);
453         }
454         return category;
455         
456     }
457     
458     // Once following relative URI methods are made public in the ROME 
459     // Atom10Parser, then use them instead and delete these.
460     
461     
462     // Fix for issue #34 "valid IRI href attributes are stripped for atom:link"
463     // URI's that didn't start with http were being treated as relative URIs.
464     // So now consider an absolute URI to be any alpha-numeric string followed
465     // by a colon, followed by anything -- specified by this regex:
466     static Pattern absoluteURIPattern = Pattern.compile("^[a-z0-9]*:.*$");
467     
468     public static boolean isAbsoluteURI(String uri) {
469         return absoluteURIPattern.matcher(uri).find();
470     }
471     
472     /*** Returns true if URI is relative. */
473     public static boolean isRelativeURI(String uri) {
474         return !isAbsoluteURI(uri);
475     }
476         
477     /***
478      * Resolve URI via base URL and parent element.
479      * Resolve URI based considering xml:base and baseURI.
480      * @param baseURI Base URI used to fetch the XML document
481      * @param parent  Parent element from which to consider xml:base
482      * @param url     URL to be resolved
483      */
484     public static String resolveURI(String baseURI, Parent parent, String url) {
485         if (!resolveURIs) {
486             return url;
487         }
488         if (isRelativeURI(url)) {
489             url = (!".".equals(url) && !"./".equals(url)) ? url : "";
490             
491             if (url.startsWith("/") && baseURI != null) {
492                 String base = null;
493                 int slashslash = baseURI.indexOf("//");
494                 int nextslash = baseURI.indexOf("/", slashslash + 2);
495                 if (nextslash != -1) base = baseURI.substring(0, nextslash);
496                 return formURI(base, url);               
497             } 
498 
499             // Relative URI with parent
500             if (parent != null && parent instanceof Element) {
501 
502                 // Do we have an xml:base?         
503                 String xmlbase = ((Element)parent).getAttributeValue(
504                     "base", Namespace.XML_NAMESPACE);
505                 if (xmlbase != null && xmlbase.trim().length() > 0) {
506                     if (isAbsoluteURI(xmlbase)) {
507                         // Absolute xml:base, so form URI right now 
508                         if (url.startsWith("/")) { 
509                             // Host relative URI
510                             int slashslash = xmlbase.indexOf("//");
511                             int nextslash = xmlbase.indexOf("/", slashslash + 2);
512                             if (nextslash != -1) xmlbase = xmlbase.substring(0, nextslash);
513                             return formURI(xmlbase, url); 
514                         }
515                         if (!xmlbase.endsWith("/")) {
516                             // Base URI is filename, strip it off 
517                             xmlbase = xmlbase.substring(0, xmlbase.lastIndexOf("/"));
518                         }
519                         return formURI(xmlbase, url);
520                     } else {
521                         // Relative xml:base, so walk up tree
522                         return resolveURI(baseURI, parent.getParent(), 
523                             stripTrailingSlash(xmlbase) + "/"+ stripStartingSlash(url));
524                     }
525                 }
526                 // No xml:base so walk up tree
527                 return resolveURI(baseURI, parent.getParent(), url);
528 
529             // Relative URI with no parent (i.e. top of tree), so form URI right now
530             } else if (parent == null || parent instanceof Document) {
531                 return formURI(baseURI, url);        
532             } 
533         }                
534         return url;
535     }
536         
537     /***
538      * Find base URI of feed considering relative URIs.
539      * @param root Root element of feed.
540      */
541     private String findBaseURI(Element root) throws MalformedURLException {
542         String ret = null;
543         if (findAtomLink(root, "self") != null) {
544             ret = findAtomLink(root, "self");
545             if (".".equals(ret) || "./".equals(ret)) ret = "";
546             if (ret.indexOf("/") != -1) ret = ret.substring(0, ret.lastIndexOf("/"));
547             ret = resolveURI(null, root, ret);
548         }
549         return ret;
550     }
551     
552     /*** 
553      * Return URL string of Atom link element under parent element.
554      * Link with no rel attribute is considered to be rel="alternate"
555      * @param parent Consider only children of this parent element
556      * @param rel    Consider only links with this relationship
557      */
558     private  String findAtomLink(Element parent, String rel) {
559         String ret = null;
560         List linksList = parent.getChildren("link", ATOM_10_NS);
561         if (linksList != null) {
562             for (Iterator links = linksList.iterator(); links.hasNext(); ) {
563                 Element link = (Element)links.next();
564                 Attribute relAtt = getAttribute(link, "rel");
565                 Attribute hrefAtt = getAttribute(link, "href");
566                 if (   (relAtt == null && "alternate".equals(rel)) 
567                     || (relAtt != null && relAtt.getValue().equals(rel))) {
568                     ret = hrefAtt.getValue();
569                     break;
570                 }
571             }
572         }
573         return ret;
574     }
575         
576     /*** 
577      * Form URI by combining base with append portion and giving 
578      * special consideration to append portions that begin with ".."
579      * @param base   Base of URI, may end with trailing slash
580      * @param append String to append, may begin with slash or ".."
581      */
582     private static String formURI(String base, String append) {
583         base = stripTrailingSlash(base);
584         append = stripStartingSlash(append);
585         if (append.startsWith("..")) {
586             String ret = null;
587             String[] parts = append.split("/");
588             for (int i=0; i<parts.length; i++) {
589                 if ("..".equals(parts[i])) {
590                     int last = base.lastIndexOf("/");
591                     if (last != -1) {
592                         base = base.substring(0, last);
593                         append = append.substring(3, append.length());
594                     }
595                     else break;
596                 }
597             }
598         }
599         return base + "/" + append;
600     }
601     
602     /*** 
603      * Strip starting slash from beginning of string.
604      */
605     private static String stripStartingSlash(String s) {
606         if (s != null && s.startsWith("/")) {
607             s = s.substring(1, s.length());
608         }
609         return s;
610     }
611     
612     /*** 
613      * Strip trailing slash from end of string.
614      */
615     private static String stripTrailingSlash(String s) {
616         if (s != null && s.endsWith("/")) {
617             s = s.substring(0, s.length() - 1);
618         }
619         return s;
620     }    
621 
622     
623     /***
624      * Parse entry from reader.
625      */
626     public static Entry parseEntry(Reader rd, String baseURI)
627         throws JDOMException, IOException, IllegalArgumentException, FeedException {
628         // Parse entry into JDOM tree
629         SAXBuilder builder = new SAXBuilder();
630         Document entryDoc = builder.build(rd);
631         Element fetchedEntryElement = entryDoc.getRootElement();
632         fetchedEntryElement.detach();
633 
634         // Put entry into a JDOM document with 'feed' root so that Rome can handle it
635         Feed feed = new Feed();
636         feed.setFeedType("atom_1.0");
637         WireFeedOutput wireFeedOutput = new WireFeedOutput();
638         Document feedDoc = wireFeedOutput.outputJDom(feed);
639         feedDoc.getRootElement().addContent(fetchedEntryElement);
640         
641         if (baseURI != null) {
642             feedDoc.getRootElement().setAttribute("base", baseURI, Namespace.XML_NAMESPACE);
643         }
644         
645         WireFeedInput input = new WireFeedInput();
646         Feed parsedFeed = (Feed)input.build(feedDoc);
647         return (Entry)parsedFeed.getEntries().get(0);
648     } 
649 }
650 
651