001 /*
002 * JSPWikiToYamConverter.java
003 * Copyright (c) 1998-2008, The University of Sheffield.
004 *
005 * This code is from the GATE project (http://gate.ac.uk/) and is free
006 * software licenced under the GNU General Public License version 3. It is
007 * distributed without any warranty. For more details see COPYING.txt in the
008 * top level directory (or at http://gatewiki.sf.net/COPYING.txt).
009 */
010
011 package gate.yam.convert;
012
013 import gate.util.GateException;
014 import gate.yam.YamFile;
015 import org.jdom.*;
016 import org.jdom.filter.ContentFilter;
017 import org.jdom.filter.ElementFilter;
018 import org.jdom.filter.Filter;
019 import org.springframework.core.io.FileSystemResource;
020
021 import javax.xml.transform.TransformerException;
022 import java.io.*;
023 import java.util.*;
024 import java.util.regex.Pattern;
025 import java.nio.channels.FileChannel;
026
027 /**
028 * Convert JSPWiki syntax to YAM.
029 * @author Valentin Tablan
030 */
031 public class JSPWikiToYamConverter {
032
033 /** Encoding used when reading JSPWiki files and writing YAM files */
034 private static final String INPUT_ENCODING= "ISO-8859-1";
035 private static final String OUTPUT_ENCODING = "UTF-8";
036 //private static final String INPUT_ENCODING = "UTF-8";
037 //private static final String OUTPUT_ENCODING = INPUT_ENCODING;
038
039 /**
040 * Characters that should be escaped when generating Yam.
041 */
042 private static final char[] YAM_SPECIAL_CHARACTERS = "_*^".toCharArray();
043
044 /**
045 * Converts a JSPWiki page into YAM format.
046 * @param jspWikiSource the String representing the JSPWiki content
047 * @return a String representation of a YAM page
048 * @throws TransformerException if problems occurred while performing the XSL
049 * transformation
050 * @throws IOException if problems occurred while parsing the JSPWiki format
051 */
052 public static String stringToString(String jspWikiSource)
053 throws TransformerException, IOException{
054 Reader reader = new StringReader(jspWikiSource);
055 return readerToString(reader);
056 }
057
058 /**
059 * Converts text in JSPWiki format to YAM format.
060 * @param jspReader a reader that provides the JSPWiki content
061 * @return a String with YAM data
062 * @throws TransformerException
063 * @throws IOException
064 */
065 public static String readerToString(Reader jspReader)
066 throws TransformerException, IOException {
067
068 return readerToStringWithTitle(jspReader, null);
069 }
070
071
072 /**
073 * Converts text in JSPWiki format to YAM format, adding the given title
074 * to the document. If the title is null, none is added.
075 * @param jspReader a reader that provides the JSPWiki content
076 * @param title the title to give the YAM document
077 * @return a String with YAM data
078 * @throws TransformerException
079 * @throws IOException
080 */
081 public static String readerToStringWithTitle(Reader jspReader, String title)
082 throws TransformerException, IOException {
083
084 JSPWikiMarkupParser parser = new JSPWikiMarkupParser(jspReader);
085 Document jdomDoc = parser.parse();
086
087 // Various conversions are needed to allow for differences beterrn
088 // jspw syntax and yam syntax.
089 processHeadings(jdomDoc);
090 processEscapes(jdomDoc);
091 massageLinks(jdomDoc);
092 processEntityReferences(jdomDoc);
093 processSpecifics(jdomDoc);
094 if(title != null) addTitle(jdomDoc, title);
095
096 return HtmlToYamConverter.jdomToString(jdomDoc);
097 }
098
099
100 /**
101 * Find all headings in a DOM, and add an empty paragraph after all of
102 * those headings that don't have one already. JSPWiki headings are terminated
103 * by new lines, whereas YAM headings are terminated by blank lines - this
104 * method ensures a correct translation.
105 * @param jdomDoc The document in which headings will be adjusted
106 */
107 private static void processHeadings(org.jdom.Document jdomDoc){
108
109 // Pattern and Filter used to get all headings
110 final Pattern headingPattern = Pattern.compile("[Hh][123456]");
111 class HeadingFilter implements Filter {
112 public boolean matches(Object obj) {
113 if(!(obj instanceof Element)) return false;
114 Element el = (Element) obj;
115 return headingPattern.matcher(el.getName()).matches();
116 }
117 }
118
119 // Keep track of Elements we want to add empty paragraphs after. We will
120 // add after iterating over the Document.
121 List<Element> toAddParaAfter = new ArrayList<Element>();
122
123 // Iterate over all headings
124 for(Iterator hIt = jdomDoc.getDescendants(new HeadingFilter());
125 hIt.hasNext();) {
126
127 // Is the next sibling an empty para?
128 Element hEl = (Element) hIt.next();
129 Content next = getNextSibling(hEl);
130
131 boolean emptyPara = false;
132 if(next instanceof Element){
133 Element nextEl = (Element) next;
134 if(nextEl.getName().equalsIgnoreCase("p")
135 && nextEl.getChildren().isEmpty()){
136 emptyPara = true;
137 }
138 }
139
140 // It's not followed by an empty para. Keep track of it
141 // for later addition, once we've finished iterating
142 if(!emptyPara) {
143 toAddParaAfter.add(hEl);
144 } // end if
145
146 } // end for
147
148 // Now add the empty paras
149 for(Element hEl : toAddParaAfter){
150 Element parEl = hEl.getParentElement();
151 int hIndex = parEl.indexOf(hEl);
152 parEl.addContent(hIndex+1, new Element("p"));
153 }
154
155 } // end processHeadings
156
157 /**
158 * Adds some text content to the document, as the first child Element of the
159 * body Element. This will become the title in YAM syntax.
160 * @param jdomDoc The document to which text content will be added
161 * @param title The title to add as text content
162 */
163 private static void addTitle(org.jdom.Document jdomDoc, String title){
164
165 Element body = jdomDoc.getRootElement().getChild("body");
166
167 // Not sure why we have to add two empty paras here - something to do
168 // with the html JSPWikiMarkupParser produces?
169 body.addContent(0, new Text(title));
170 body.addContent(1, new Element("p"));
171 body.addContent(2, new Element("p"));
172
173 }
174
175 /**
176 * Find all local hrefs in a document, and massage them into yam form. Strip
177 * out the leading VIEW which is added by JSPWikiMarkupParser, and carried
178 * through to yam unless we remove them. Add a .html suffix.
179 * @param jdomDoc The document in which links will be adjusted
180 */
181 private static void massageLinks(org.jdom.Document jdomDoc){
182
183 // Go thorugh all anchors
184 for(Iterator aIt = jdomDoc.getDescendants(new ElementFilter("a"));
185 aIt.hasNext();){
186
187 // Get the href attribute
188 Element aEl = (Element) aIt.next();
189 String href = aEl.getAttributeValue("href");
190
191 if(href != null) {
192
193 // Remove leading VIEW and add html suffix
194 if(href.startsWith("VIEW")) {
195 href = href.substring(4) + ".html";
196 }
197
198 // Escape commas and spaces in URLs
199 href = href.replace(",", "\\,");
200 href = href.replace(" ", "\\ ");
201
202 // Set the new href
203 aEl.setAttribute("href", href);
204
205 }
206 }
207 }
208
209 /**
210 * Deal with specific one-off problems in JSPWiki to YAM conversion.
211 * Essentially, a load of hard coding to handle strange cases...
212 * @param jdomDoc The document to process
213 */
214 private static void processSpecifics(org.jdom.Document jdomDoc){
215
216 // Things to remove
217 List<Content> toRemove = new ArrayList<Content>();
218
219 // Get all text content
220 for(Iterator textIt =
221 jdomDoc.getDescendants(new ContentFilter(ContentFilter.TEXT));
222 textIt.hasNext();) {
223
224 Text text = (Text) textIt.next();
225 String content = text.getText();
226
227 // Remove references to JSPWiki group admin pages
228 if(content.contains("Group.jsp?group")){
229 toRemove.add(text);
230 }
231 }
232
233 for(Content remove : toRemove) {
234 Element parent = remove.getParentElement();
235 parent.removeContent(remove);
236
237 // If the parent was a list item and the content was its only child
238 // remove the item
239 if(parent.getName().equals("li") && parent.getChildren().size() == 0) {
240 Element grandParent = parent.getParentElement();
241 grandParent .removeContent(parent);
242 }
243
244
245 }
246
247
248 }
249
250 /**
251 * Special entities and their replacements
252 */
253 private static Map<String, String> SPECIAL_ENTITIES;
254 static {
255 SPECIAL_ENTITIES = new HashMap<String, String>();
256 SPECIAL_ENTITIES.put("<", "<");
257 SPECIAL_ENTITIES.put(">", ">");
258 SPECIAL_ENTITIES.put("&", "&");
259 SPECIAL_ENTITIES.put(""", "\"");
260 }
261
262 /**
263 * This method replaces all references to html special
264 * entities, in a DOM, with their legal yam characters.
265 * @param jdomDoc The document to process
266 */
267 private static void processEntityReferences(org.jdom.Document jdomDoc){
268
269 // Get all text content
270 for(Iterator textIt =
271 jdomDoc.getDescendants(new ContentFilter(ContentFilter.TEXT));
272 textIt.hasNext();) {
273
274 Text text = (Text) textIt.next();
275 String content = text.getText();
276
277 // Do the replacements. Could be faster...
278 for(String key : SPECIAL_ENTITIES.keySet()) {
279 content = content.replace(key, SPECIAL_ENTITIES.get(key));
280 }
281
282 // strip leading and trailing newlines but not other space characters
283 // from text nodes in list items
284 if (text.getParentElement().getName().equalsIgnoreCase("li"))
285 content = content.replaceAll("^(?:\\r?\\n)*", " ").replaceAll("(?:\\r?\\n)*$", " ");
286
287 // Set it back in the text
288 text.setText(content);
289 }
290 }
291
292 /**
293 * This method walks the whole DOM tree and, for each text node found, it
294 * escapes the YAM special characters. Adapted from HtmlToYamConverter.
295 *
296 * @param jdomDoc The document in which special characters will be escaped
297 */
298 private static void processEscapes(org.jdom.Document jdomDoc){
299
300 org.jdom.Content currentNode = jdomDoc.getRootElement();
301 boolean finished = false;
302 while(!finished){
303
304 //if the current node is a text node, fix it
305 if(currentNode instanceof org.jdom.Text && !currentNode.getParentElement().getName().equalsIgnoreCase("pre")){
306 org.jdom.Text textNode = (org.jdom.Text)currentNode;
307
308 //escape the special chars
309 String textData = textNode.getText();
310 for(char c : YAM_SPECIAL_CHARACTERS){
311 if(textData.indexOf(c) != -1){
312 textData=textData.replace(Character.toString(c), "\\" + c);
313 }
314 }
315 textNode.setText(textData);
316 }
317
318 //if it has children, start processing them
319 Content nextNode = null;
320 if(
321 currentNode instanceof Parent &&
322 ((Parent)currentNode).getContentSize() > 0
323 ) {
324 nextNode = ((Parent)currentNode).getContent(0);
325 }
326 if(nextNode == null){
327 //no children -> try siblings
328 nextNode = getNextSibling(currentNode);
329 if(nextNode == null){
330 //no siblings either -> do the backtrace till the first uncle
331 while(nextNode == null && ! finished){
332 Parent parent = currentNode.getParent();
333 if(parent == null || parent instanceof org.jdom.Document){
334 finished = true;
335 }else{
336 currentNode = (Content)parent;
337 nextNode = getNextSibling((Content)parent);
338 }
339 }
340 }
341 }
342 currentNode = nextNode;
343 }
344 }
345
346 /**
347 * Gets the sibling of a JDom node. Copied from HtmlToYamConverter
348 * @param node The node for which the next sibling will be returned
349 * @return The next sibling of node
350 */
351 private static Content getNextSibling(Content node){
352 Parent parent = node.getParent();
353 if(parent!= null){
354 int currentIndex = parent.indexOf(node);
355 if(parent.getContentSize() > (currentIndex + 1)){
356 return parent.getContent(currentIndex + 1);
357 }
358 }
359 return null;
360 }
361
362 /**
363 * Get the attachments to a JSPWiki file, copy them to a YAM wiki site, and
364 * list links to them at the end of the given yam file.
365 * @param jspwFile The JSPWiki text file from which attachments will be taken
366 * @param yamFile The YAM text file to which attachments will be added
367 */
368 private static void processAttachments(File jspwFile, File yamFile)
369 throws IOException{
370
371 // JSPWiki paths and names
372 String jspwFilePath = jspwFile.getAbsolutePath();
373 String jspwAttachDirPath
374 = jspwFilePath.substring(0, jspwFilePath.length() - 4);
375 File jspwAttachDir = new File(jspwAttachDirPath);
376
377 // YAM paths and names
378 String yamAttachDirName = jspwAttachDir.getName();
379 File yamAttachDir = new File(yamFile.getParent(), yamAttachDirName);
380
381 // Is there an attachment directory?
382 if(jspwAttachDir.isDirectory()) {
383
384 // List of file names to add to yam
385 List<String> yamAttachFileNames = new ArrayList<String>();
386
387 // Attach each content file, if it is an ordinary file and not hidden
388 for(File jspwAttachFile : jspwAttachDir.listFiles()) {
389 if(jspwAttachFile.isFile() && !jspwAttachFile.isHidden()) {
390
391 String yamAttachFileName = jspwAttachFile.getName();
392
393 // Save the name for sticking at the end of the YAM
394 yamAttachFileNames.add(yamAttachFileName);
395
396 // Copy the file
397 File yamAttachFile = new File(yamAttachDir, yamAttachFileName);
398 copy(jspwAttachFile, yamAttachFile);
399 }
400 }
401
402 // Make a list of attachments in yam
403 StringBuilder strB = new StringBuilder();
404 strB.append("---\n");
405 strB.append("%2* Attachments\n");
406 for(String fileName : yamAttachFileNames) {
407 strB.append("- %(");
408 strB.append(yamAttachDirName).append("/").append(fileName);
409 strB.append(", ").append(fileName).append(")\n");
410 }
411
412 // Append the list to the end of the yam file
413 PrintWriter pw = new PrintWriter(
414 new FileOutputStream(yamFile, true), true);
415 pw.append(strB.toString());
416 pw.close();
417
418 }
419
420 }
421
422 /**
423 * Copy one File to another.
424 * @param in The File that will be copied
425 * @param out The File to which in will be copied
426 * @throws IOException if the copy fails
427 */
428 private static void copy(File in, File out) throws IOException {
429
430 if(!out.getParentFile().isDirectory()) out.getParentFile().mkdirs();
431 FileChannel ic = new FileInputStream(in).getChannel();
432 FileChannel oc = new FileOutputStream(out).getChannel();
433 ic.transferTo(0, ic.size(), oc);
434 ic.close();
435 oc.close();
436 }
437
438
439
440 /**
441 * A FilenameFilter that accepts JSPWiki .txt source files.
442 */
443 private static class JSPWikiFileFilter implements FilenameFilter {
444 /** Accept a file if it is a .txt file*/
445 public boolean accept(File dir, String name) {
446 return name.endsWith(".txt");
447 }
448 }
449
450 /**
451 * Run the JSPWikiToYamConverter, translating the files specified on the
452 * command line from JSPWiki to YAM format.
453 * @param args (JSPWiki file | JSPWiki directory) [output directory]
454 */
455 public static void main(String[] args){
456
457 if(args.length < 1) {
458 printUsage();
459 System.exit(1);
460 }
461
462 // Get the files to convert
463 File inFile = new File(args[0]);
464 List<File> filesToConvert = new ArrayList<File>();
465 if(inFile.isFile()){
466
467 // If args[0] is a normal file, convert it to a yam file.
468 filesToConvert.add(inFile);
469
470 } else if(inFile.isDirectory()) {
471
472 // If it is a directory, convert every .txt file within to a yam file.
473 File[] filesInDir = inFile.listFiles(new JSPWikiFileFilter());
474 filesToConvert.addAll(Arrays.asList(filesInDir));
475
476 } else {
477 // If it is neither a normal file nor a directory
478 printUsage();
479 System.exit(1);
480 }
481
482 // Where shall we write output?
483 String outDirName = null;
484 if(args.length > 1) {
485 outDirName = args[1];
486 if(! new File(outDirName).isDirectory()) {
487 printUsage();
488 System.exit(1);
489 }
490 }
491
492 // Collect errors
493 List<String> errors = new ArrayList<String>();
494
495 // Collect all the yam files for generation. We do this
496 // as a separate step to ensure linked files are present
497 List<YamFile> yamsToGenerate = new ArrayList<YamFile>();
498
499 // Translate each file
500 for(File jspwFile : filesToConvert){
501
502 // Make a yam disk file. It will have the same filename prefix as its
503 // source, and a .yam suffix.
504 String jspwFileName = jspwFile.getName();
505 String prefix = jspwFileName.substring(0, jspwFileName.length() - 4);
506 String yamFileName = prefix + ".yam";
507 File yamDiskFile = new File(outDirName, yamFileName);
508
509 try {
510
511
512 //---------------------------------------------------------
513 // uncomment this section if you need to see the
514 // JSPWikiMarkupParser's dom.
515 //---------------------------------------------------------
516 // JSPWikiMarkupParser parser =
517 // new JSPWikiMarkupParser(
518 // new InputStreamReader(new FileInputStream(jspwFile),
519 // INPUT_ENCODING));
520 // Document jdomDoc = parser.parse();
521 // org.jdom.output.XMLOutputter jspwOut
522 // = new org.jdom.output.XMLOutputter();
523 // jspwOut.output(jdomDoc,
524 // new PrintWriter(new File(outDirName,
525 // prefix + ".jspw.html"),
526 // OUTPUT_ENCODING));
527 //---------------------------------------------------------
528
529 // Make the YAM disk file
530 System.out.println("Translating " + jspwFileName);
531 PrintWriter yamOut = new PrintWriter(yamDiskFile, OUTPUT_ENCODING);
532 Reader reader = new InputStreamReader(
533 new FileInputStream(jspwFile), INPUT_ENCODING);
534
535 yamOut.println(readerToStringWithTitle(reader, prefix));
536 yamOut.flush();
537
538 // Make the YamFile from the disk file
539 YamFile yamFile = YamFile.get(
540 new FileSystemResource(yamDiskFile.getCanonicalPath()));
541
542 // Attachments. These are not listed in the JSPWiki text file, but need
543 // to be added to the end of the YAM.
544 processAttachments(jspwFile, yamDiskFile);
545
546 //yamFile.generate();
547 yamFile.setContextPath(outDirName);
548 yamsToGenerate.add(yamFile);
549
550 } catch(Exception e) {
551 e.printStackTrace();
552 errors.add(yamFileName + ": " + e.toString());
553 }
554
555 // Generate - we do this once we have all yam files are present, to make
556 // sure linked files are present
557 for(YamFile yam : yamsToGenerate){
558 try{
559 yam.generate();
560 } catch(GateException ge){
561 ge.printStackTrace();
562 errors.add(yam + ": " + ge.toString());
563 }
564 }
565
566 } // end of for
567
568 // Report the errors
569 System.out.println("Translation finished with " + errors.size()
570 + " errors");
571 for(String error : errors) {
572 System.out.println(error);
573 }
574
575 } // end of main
576
577 /**
578 * Print the command line usage of th is class to standard out.
579 */
580 private static void printUsage() {
581 System.out.println("JSPWikiToYamConverter - convert JSPWiki files to YAM");
582 System.out.println("Usage:");
583 System.out.println(" JSPWikiToYamConverter (file|directory) [outputDir]");
584 System.out.println(" file: JSPWiki file to translate");
585 System.out.println(" directory: directory of files to translate");
586 System.out.println(" outputDir: directory to write YAM files to");
587 System.out.println(" (defaults to current directory)");
588 }
589
590
591
592 } //
|