001 /*
002 * HtmlToYamConverter.java
003 * Copyright (c) 1998-2008, The University of Sheffield.
004 *
005 * This code is from the GATE project (http://gate.ac.uk/) and is free
006 * software licenced under the GNU General Public License version 3. It is
007 * distributed without any warranty. For more details see COPYING.txt in the
008 * top level directory (or at http://gatewiki.sf.net/COPYING.txt).
009 */
010
011 package gate.yam.convert;
012
013 import java.io.*;
014 import java.net.URL;
015 import java.util.ArrayList;
016 import java.util.Arrays;
017 import java.util.HashSet;
018 import java.util.Iterator;
019 import java.util.List;
020 import java.util.Set;
021
022 import javax.xml.transform.*;
023 import javax.xml.transform.dom.DOMSource;
024 import javax.xml.transform.stream.StreamResult;
025 import javax.xml.transform.stream.StreamSource;
026
027 import org.apache.log4j.Logger;
028 import org.cyberneko.html.parsers.DOMParser;
029 import org.jdom.Content;
030 import org.jdom.Element;
031 import org.jdom.Parent;
032 import org.jdom.filter.Filter;
033 import org.jdom.input.DOMBuilder;
034 import org.jdom.output.Format;
035 import org.jdom.output.XMLOutputter;
036 import org.jdom.transform.JDOMSource;
037 import org.w3c.dom.*;
038 import org.xml.sax.InputSource;
039 import org.xml.sax.SAXException;
040
041 /**
042 * Convert HTML to YAM. The bulk of the conversion work is done by an XSLT
043 * stylesheet, but there is a small amount of pre-processing done in Java to
044 * fix up things that are very difficult or impossible to do in XSLT. In
045 * particular, for lists that are nested inside other lists, e.g.:
046 * <pre>
047 * <ul>
048 * <li>A list item
049 * <ul>
050 * <li>Nested list</li>
051 * </ul></li>
052 * </ul>
053 * </pre>
054 *
055 * we must strip the whitespace between the parent <code>li</code> text ("A
056 * list item<newline><four spaces>") and the opening nested
057 * <code>ul</code> tag, otherwise the list nesting is lost in the generated
058 * yam.
059 *
060 * @author Valentin Tablan, modified by Ian Roberts
061 */
062 public class HtmlToYamConverter {
063
064 private static final Logger log = Logger.getLogger(HtmlToYamConverter.class);
065
066 /**
067 * The encoding used for the XSL documents
068 */
069 private static final String XSL_ENCODING = "UTF-8";
070
071 /**
072 * The XSL transformer used for HTML to YAM conversions
073 */
074 private static Transformer transformer;
075
076 /**
077 * Set containing the HTML element names that represent lists. Tag names
078 * must be in upper case, as the DOM documents produced by NekoHTML report
079 * their tag names in upper case regardless of the original case used in the
080 * HTML.
081 */
082 private static Set<String> listTags = new HashSet<String>(Arrays.asList(
083 new String[] { "OL", "UL" }));
084
085 private static void initTransformer(){
086 InputStream xslIs =
087 HtmlToYamConverter.class.getResourceAsStream("xhtml2yam.xsl");
088 Reader styleSheetReader;
089 try {
090 styleSheetReader = new InputStreamReader(xslIs, XSL_ENCODING);
091 transformer = TransformerFactory.newInstance().
092 newTransformer(new StreamSource(styleSheetReader));
093 } catch(UnsupportedEncodingException e) {
094 //this should never happen
095 throw new RuntimeException("Invalid encoding used:", e);
096 } catch(TransformerConfigurationException e) {
097 //this should never happen
098 throw new RuntimeException("Invalid XSL configuration:", e);
099 } catch(TransformerFactoryConfigurationError e) {
100 //this should never happen
101 throw new RuntimeException("Invalid XSL factory configuration:", e);
102 }
103 }
104
105 /**
106 * Converts HTML source provided as String to YAM format returned as
107 * String.
108 * @param htmlSource the String representation of the input HTML document.
109 * @return a String representing a document in YAM format
110 * @throws SAXException
111 * @throws IOException
112 * @throws TransformerException
113 */
114 public static String stringToString(String htmlSource) throws SAXException,
115 IOException, TransformerException {
116 return readerToString(new StringReader(htmlSource));
117 }
118
119 /**
120 * Converts HTML source provided from a reader to YAM format returned as
121 * string.
122 * @param htmlReader the Reader supplying the html source document
123 * @return a String representing a document in YAM format
124 * @throws SAXException
125 * @throws IOException
126 * @throws TransformerException
127 */
128 public static String readerToString(Reader htmlReader)
129 throws SAXException, IOException, TransformerException {
130 //first get the input DOM
131 DOMParser parser = new DOMParser();
132 parser.setFeature("http://xml.org/sax/features/namespaces", false);
133 parser.parse(new InputSource(htmlReader));
134 Document domDoc = parser.getDocument();
135 //now transform it
136 return domToString(domDoc);
137 // test - use JDOM instead
138 //return jdomToString(new DOMBuilder().build(domDoc));
139 }
140
141 /**
142 * Transforms a DOM document into a String representation in YAM format.
143 * Does some minor pre-processing of the DOM tree to clean up some things
144 * that are extremely difficult in XSLT.
145 * @param input the input DOM document, in HTML
146 * @return a String value with the parsed results
147 * @throws TransformerException
148 */
149 public static synchronized String domToString(Document input) throws TransformerException
150 {
151 if(transformer == null) initTransformer();
152 //when debugging it's useful to see the actual input to the XSL transformer
153 //if you need that, uncomment the following lines
154 //try {
155 // System.err.println("DOM Document:\n======================================");
156 // org.jdom.Document jdomInput = new DOMBuilder().build(input);
157 // new XMLOutputter(Format.getPrettyFormat()).output(jdomInput, System.err);
158 // System.err.println("\n======================================");
159 //} catch(IOException e) {
160 // e.printStackTrace();
161 //}
162
163 // find any nested lists
164 for(String listTag : listTags) {
165 NodeList lists = input.getDocumentElement().getElementsByTagName(
166 listTag);
167 log.debug("Found " + lists.getLength() + " elements with name " + listTag);
168 for(int i = 0; i < lists.getLength(); ++i) {
169 org.w3c.dom.Element elt = (org.w3c.dom.Element)lists.item(i);
170 String parentName = elt.getParentNode().getNodeName();
171 if(parentName.equalsIgnoreCase("li")) {
172 // this is a nested list, strip trailing whitespace off the
173 // immediately preceding text node.
174 log.debug("Found nested list " + elt);
175 Node preceding = elt.getPreviousSibling();
176 if(preceding != null && preceding instanceof org.w3c.dom.Text) {
177 log.debug("Stripping trailing whitespace from \"" + preceding + "\"");
178 String text = ((org.w3c.dom.Text)preceding).getData();
179 String strippedText = text.replaceAll("\\s*$", "");
180 if(!strippedText.equals(text)) {
181 org.w3c.dom.Text newText = input.createTextNode(strippedText);
182 elt.getParentNode().replaceChild(newText, preceding);
183 }
184 }
185 }
186 }
187 }
188
189 StringWriter resultStr = new StringWriter();
190 transformer.transform(new DOMSource(input), new StreamResult(resultStr));
191 return resultStr.toString();
192 }
193
194 /**
195 * Transforms a DOM document into a String representation in YAM format.
196 * Does some minor pre-processing of the JDOM tree to clean up some things
197 * that are extremely difficult in XSLT.
198 * @param input the jDom document, in HTML
199 * @return a String in YAM format
200 * @throws TransformerException
201 */
202 public static synchronized String jdomToString(org.jdom.Document input)
203 throws TransformerException {
204 if(transformer == null) initTransformer();
205
206 log.debug("Searching for nested lists");
207 Iterator<org.jdom.Element> nestedLists = input.getDescendants(new Filter() {
208 public boolean matches(Object o) {
209 if(!(o instanceof org.jdom.Element)) {
210 return false;
211 }
212 org.jdom.Element e = (org.jdom.Element)o;
213 if(listTags.contains(e.getName())) {
214 if(e.getParent() instanceof org.jdom.Element) {
215 org.jdom.Element parent = (org.jdom.Element)e.getParent();
216 if("li".equalsIgnoreCase(parent.getName())) {
217 // e is a nested list
218 return true;
219 }
220 }
221 }
222 return false;
223 }
224 });
225
226 while(nestedLists.hasNext()) {
227 org.jdom.Element list = nestedLists.next();
228 log.debug("Found nested list " + list);
229 org.jdom.Element parentLi = (org.jdom.Element)list.getParent();
230 int index = parentLi.indexOf(list);
231 if(index > 0) {
232 Content preceding = parentLi.getContent(index - 1);
233 if(preceding instanceof org.jdom.Text) {
234 log.debug("Stripping trailing whitespace from " + preceding);
235 org.jdom.Text text = (org.jdom.Text)preceding;
236 text.setText(text.getText().replaceAll("\\s*$", ""));
237 }
238 }
239 }
240
241
242 StringWriter resultStr = new StringWriter();
243 transformer.transform(new JDOMSource(input), new StreamResult(resultStr));
244 return resultStr.toString();
245 }
246
247
248
249 /**
250 * Test code - DO NOT USE!
251 * Given a html file arg[0], writes out its yam file to the directory arg[1]
252 * @param args
253 */
254 public static void main(String[] args) throws Exception {
255 // URL pageUrl = new URL("http://gate.ac.uk");
256 // String inputEncoding = "ISO-8859-1";
257 // Reader reader = new InputStreamReader(pageUrl.openStream(),
258 // inputEncoding);
259 // System.out.println(readerToString(reader));
260
261 String encoding = "ISO-8859-1";
262 File inFile = new File(args[0]);
263 String inFileName = inFile.getName();
264 String inFilePrefix = inFileName.substring(0, inFileName.length() - 5);
265 File outFile = new File(args[1], inFilePrefix + ".yam");
266
267 System.out.println("Translating " + inFile + " to " + outFile);
268
269 Reader in = new InputStreamReader(new FileInputStream(inFile), encoding);
270 PrintWriter out = new PrintWriter(outFile, encoding);
271
272 out.println(readerToString(in));
273 out.flush();
274
275 }
276 }
|