gate/yam/convert/JSPWikiToYamConverter.java


JSPWikiToYamConverter.java


001 /*

002  *  JSPWikiToYamConverter.java

003  *  Copyright (c) 1998-2008, The University of Sheffield.

004  *

005  *  This code is from the GATE project (http://gate.ac.uk/) and is free

006  *  software licenced under the GNU General Public License version 3. It is

007  *  distributed without any warranty. For more details see COPYING.txt in the

008  *  top level directory (or at http://gatewiki.sf.net/COPYING.txt).

009  */

010 

011 package gate.yam.convert;

012 

013 import gate.util.GateException;

014 import gate.yam.YamFile;

015 import org.jdom.*;

016 import org.jdom.filter.ContentFilter;

017 import org.jdom.filter.ElementFilter;

018 import org.jdom.filter.Filter;

019 import org.springframework.core.io.FileSystemResource;

020 

021 import javax.xml.transform.TransformerException;

022 import java.io.*;

023 import java.util.*;

024 import java.util.regex.Pattern;

025 import java.nio.channels.FileChannel;

026 

027 /** 

028  * Convert JSPWiki syntax to YAM.

029  * @author Valentin Tablan

030  */

031 public class JSPWikiToYamConverter {

032 

033   /** Encoding used when reading JSPWiki files and writing YAM files */

034   private static final String INPUT_ENCODING= "ISO-8859-1";

035   private static final String OUTPUT_ENCODING = "UTF-8";

036   //private static final String INPUT_ENCODING = "UTF-8";

037   //private static final String OUTPUT_ENCODING = INPUT_ENCODING;

038 

039   /**

040    * Characters that should be escaped when generating Yam.

041    */

042   private static final char[] YAM_SPECIAL_CHARACTERS = "_*^".toCharArray();

043 

044   /**

045    * Converts a JSPWiki page into YAM format.

046    * @param jspWikiSource the String representing the JSPWiki content

047    * @return a String representation of a YAM page

048    * @throws TransformerException if problems occurred while performing the XSL

049    * transformation

050    * @throws IOException if problems occurred while parsing the JSPWiki format 

051    */

052   public static String stringToString(String jspWikiSource) 

053       throws TransformerException, IOException{

054     Reader reader = new StringReader(jspWikiSource);

055     return readerToString(reader);

056   }

057 

058   /**

059    * Converts text in JSPWiki format to YAM format.

060    * @param jspReader a reader that provides the JSPWiki content

061    * @return a String with YAM data

062    * @throws TransformerException

063    * @throws IOException

064    */

065   public static String readerToString(Reader jspReader)

066           throws TransformerException, IOException {

067 

068     return readerToStringWithTitle(jspReader, null);

069   }

070 

071 

072   /**

073    * Converts text in JSPWiki format to YAM format, adding the given title

074    * to the document. If the title is null, none is added.

075    * @param jspReader a reader that provides the JSPWiki content

076    * @param title the title to give the YAM document

077    * @return a String with YAM data

078    * @throws TransformerException

079    * @throws IOException

080    */

081   public static String readerToStringWithTitle(Reader jspReader, String title)

082           throws TransformerException, IOException {

083     

084     JSPWikiMarkupParser parser = new JSPWikiMarkupParser(jspReader);

085     Document jdomDoc = parser.parse();

086 

087     // Various conversions are needed to allow for differences beterrn

088     // jspw syntax and yam syntax.

089     processHeadings(jdomDoc);

090     processEscapes(jdomDoc);

091     massageLinks(jdomDoc);

092     processEntityReferences(jdomDoc);

093     processSpecifics(jdomDoc);

094     if(title != null) addTitle(jdomDoc, title);

095     

096     return HtmlToYamConverter.jdomToString(jdomDoc);

097   }

098 

099 

100   /**

101    * Find all headings in a DOM, and add an empty paragraph after all of

102    * those headings that don't have one already. JSPWiki headings are terminated

103    * by new lines, whereas YAM headings are terminated by blank lines - this

104    * method ensures a correct translation.

105    * @param jdomDoc The document in which headings will be adjusted

106    */

107   private static void processHeadings(org.jdom.Document jdomDoc){

108 

109     // Pattern and Filter used to get all headings

110     final Pattern headingPattern = Pattern.compile("[Hh][123456]");

111     class HeadingFilter implements Filter {

112        public boolean matches(Object obj) {

113          if(!(obj instanceof Element)) return false;

114          Element el = (Element) obj;

115          return headingPattern.matcher(el.getName()).matches();

116       }

117     }

118 

119     // Keep track of Elements we want to add empty paragraphs after. We will

120     // add after iterating over the Document.

121     List<Element> toAddParaAfter = new ArrayList<Element>();

122 

123     // Iterate over all headings

124     for(Iterator hIt = jdomDoc.getDescendants(new HeadingFilter());

125         hIt.hasNext();) {

126 

127       // Is the next sibling an empty para?

128       Element hEl = (Element) hIt.next();

129       Content next = getNextSibling(hEl);

130 

131       boolean emptyPara = false;

132       if(next instanceof Element){

133         Element nextEl = (Element) next;

134         if(nextEl.getName().equalsIgnoreCase("p")

135                 && nextEl.getChildren().isEmpty()){

136           emptyPara = true;

137         }

138       }

139 

140       // It's not followed by an empty para. Keep track of it

141       // for later addition, once we've finished iterating

142       if(!emptyPara) {

143         toAddParaAfter.add(hEl);

144       } // end if

145 

146     } // end for

147 

148     // Now add the empty paras

149     for(Element hEl : toAddParaAfter){

150       Element parEl = hEl.getParentElement();

151       int hIndex = parEl.indexOf(hEl);

152       parEl.addContent(hIndex+1, new Element("p"));

153     }

154     

155   } // end processHeadings

156 

157   /**

158    * Adds some text content to the document, as the first child Element of the

159    * body Element. This will become the title in YAM syntax.

160    * @param jdomDoc The document to which text content will be added

161    * @param title The title to add as text content

162    */

163   private static void addTitle(org.jdom.Document jdomDoc, String title){

164 

165     Element body = jdomDoc.getRootElement().getChild("body");

166     

167     // Not sure why we have to add two empty paras here - something to do

168     // with the html JSPWikiMarkupParser produces?

169     body.addContent(0, new Text(title));

170     body.addContent(1, new Element("p"));

171     body.addContent(2, new Element("p"));

172 

173   }

174 

175   /**

176    * Find all local hrefs in a document, and massage them into yam form. Strip

177    * out the leading VIEW which is added by JSPWikiMarkupParser, and carried

178    * through to yam unless we remove them. Add a .html suffix.

179    * @param jdomDoc The document in which links will be adjusted

180    */

181   private static void massageLinks(org.jdom.Document jdomDoc){

182 

183     // Go thorugh all anchors

184     for(Iterator aIt = jdomDoc.getDescendants(new ElementFilter("a"));

185         aIt.hasNext();){

186 

187       // Get the href attribute

188       Element aEl = (Element) aIt.next();

189       String href = aEl.getAttributeValue("href");

190 

191       if(href != null) {

192 

193         // Remove leading VIEW and add html suffix

194         if(href.startsWith("VIEW")) {

195           href = href.substring(4) + ".html";

196         }

197 

198         // Escape commas and spaces in URLs

199         href = href.replace(",", "\\,");

200         href = href.replace(" ", "\\ ");

201 

202         // Set the new href

203         aEl.setAttribute("href", href);

204 

205       }

206     }

207   }

208 

209   /**

210    * Deal with specific one-off problems in JSPWiki to YAM conversion.

211    * Essentially, a load of hard coding to handle strange cases...

212    * @param jdomDoc The document to process

213    */

214   private static void processSpecifics(org.jdom.Document jdomDoc){

215 

216     // Things to remove

217     List<Content> toRemove = new ArrayList<Content>();

218 

219     // Get all text content

220     for(Iterator textIt =

221             jdomDoc.getDescendants(new ContentFilter(ContentFilter.TEXT));

222         textIt.hasNext();) {

223 

224       Text text = (Text) textIt.next();

225       String content = text.getText();

226 

227       // Remove references to JSPWiki group admin pages

228       if(content.contains("Group.jsp?group")){

229         toRemove.add(text);

230       }

231     }

232 

233     for(Content remove : toRemove) {

234       Element parent = remove.getParentElement();

235       parent.removeContent(remove);

236 

237       // If the parent was a list item and the content was its only child

238       // remove the item

239       if(parent.getName().equals("li") && parent.getChildren().size() == 0) {

240         Element grandParent = parent.getParentElement();

241         grandParent .removeContent(parent);

242       }

243 

244 

245     }

246     

247 

248   }

249 

250   /**

251    * Special entities and their replacements

252    */

253   private static Map<String, String> SPECIAL_ENTITIES;

254   static {

255     SPECIAL_ENTITIES = new HashMap<String, String>();

256     SPECIAL_ENTITIES.put("&lt;", "<");

257     SPECIAL_ENTITIES.put("&gt;", ">");

258     SPECIAL_ENTITIES.put("&amp;", "&");

259     SPECIAL_ENTITIES.put("&quot;", "\"");

260   }

261 

262   /**

263    * This method replaces all references to html special

264    * entities, in a DOM, with their legal yam characters.

265    * @param jdomDoc The document to process

266    */

267   private static void processEntityReferences(org.jdom.Document jdomDoc){

268 

269     // Get all text content

270     for(Iterator textIt =

271             jdomDoc.getDescendants(new ContentFilter(ContentFilter.TEXT));

272         textIt.hasNext();) {

273 

274       Text text = (Text) textIt.next();

275       String content = text.getText();

276 

277       // Do the replacements. Could be faster...

278       for(String key : SPECIAL_ENTITIES.keySet()) {

279         content = content.replace(key, SPECIAL_ENTITIES.get(key));

280       }

281 

282       // strip leading and trailing newlines but not other space characters

283       // from text nodes in list items

284       if (text.getParentElement().getName().equalsIgnoreCase("li"))

285         content = content.replaceAll("^(?:\\r?\\n)*", " ").replaceAll("(?:\\r?\\n)*$", " ");

286       

287       // Set it back in the text

288       text.setText(content);

289     }

290   }

291 

292   /**

293    * This method walks the whole DOM tree and, for each text node found, it

294    * escapes the YAM special characters. Adapted from HtmlToYamConverter.

295    *

296    * @param jdomDoc The document in which special characters will be escaped

297    */

298   private static void processEscapes(org.jdom.Document jdomDoc){

299    

300     org.jdom.Content currentNode = jdomDoc.getRootElement();

301     boolean finished = false;

302     while(!finished){

303 

304       //if the current node is a text node, fix it

305       if(currentNode instanceof org.jdom.Text && !currentNode.getParentElement().getName().equalsIgnoreCase("pre")){

306         org.jdom.Text textNode = (org.jdom.Text)currentNode;

307 

308         //escape the special chars

309         String textData = textNode.getText();

310         for(char c : YAM_SPECIAL_CHARACTERS){

311           if(textData.indexOf(c) != -1){

312             textData=textData.replace(Character.toString(c), "\\" + c);

313           }

314         }

315         textNode.setText(textData);

316       }

317 

318       //if it has children, start processing them

319       Content nextNode = null;

320       if(

321         currentNode instanceof Parent &&

322         ((Parent)currentNode).getContentSize() > 0

323       ) {

324         nextNode = ((Parent)currentNode).getContent(0);

325       }

326       if(nextNode == null){

327         //no children -> try siblings

328         nextNode = getNextSibling(currentNode);

329         if(nextNode == null){

330           //no siblings either -> do the backtrace till the first uncle

331           while(nextNode == null && ! finished){

332             Parent parent = currentNode.getParent();

333             if(parent == null || parent instanceof org.jdom.Document){

334               finished = true;

335             }else{

336               currentNode = (Content)parent;

337               nextNode = getNextSibling((Content)parent);

338             }

339           }

340         }

341       }

342       currentNode = nextNode;

343     }

344   }

345 

346   /**

347    * Gets the sibling of a JDom node. Copied from HtmlToYamConverter

348    * @param node The node for which the next sibling will be returned

349    * @return The next sibling of node

350    */

351   private static Content getNextSibling(Content node){

352     Parent parent = node.getParent();

353     if(parent!= null){

354       int currentIndex = parent.indexOf(node);

355       if(parent.getContentSize() > (currentIndex + 1)){

356         return parent.getContent(currentIndex + 1);

357       }

358     }

359     return null;

360   }

361 

362   /**

363    * Get the attachments to a JSPWiki file, copy them to a YAM wiki site, and

364    * list links to them at the end of the given yam file.

365    * @param jspwFile The JSPWiki text file from which attachments will be taken

366    * @param yamFile The YAM text file to which attachments will be added

367    */

368   private static void processAttachments(File jspwFile, File yamFile)

369           throws IOException{

370 

371     // JSPWiki paths and names

372     String jspwFilePath = jspwFile.getAbsolutePath();

373     String jspwAttachDirPath

374             = jspwFilePath.substring(0, jspwFilePath.length() - 4);

375     File jspwAttachDir = new File(jspwAttachDirPath);

376 

377     // YAM paths and names

378     String yamAttachDirName = jspwAttachDir.getName();

379     File yamAttachDir = new File(yamFile.getParent(), yamAttachDirName);

380 

381     // Is there an attachment directory?

382     if(jspwAttachDir.isDirectory()) {

383 

384       // List of file names to add to yam

385       List<String> yamAttachFileNames = new ArrayList<String>();

386 

387       // Attach each content file, if it is an ordinary file and not hidden

388       for(File jspwAttachFile : jspwAttachDir.listFiles()) {

389         if(jspwAttachFile.isFile() && !jspwAttachFile.isHidden()) {

390 

391           String yamAttachFileName = jspwAttachFile.getName();

392 

393           // Save the name for sticking at the end of the YAM

394           yamAttachFileNames.add(yamAttachFileName);

395 

396           // Copy the file

397           File yamAttachFile = new File(yamAttachDir, yamAttachFileName);

398           copy(jspwAttachFile, yamAttachFile);

399         }

400       }

401 

402       // Make a list of attachments in yam

403       StringBuilder strB = new StringBuilder();

404       strB.append("---\n");

405       strB.append("%2* Attachments\n");

406       for(String fileName : yamAttachFileNames) {

407         strB.append("- %(");

408         strB.append(yamAttachDirName).append("/").append(fileName);

409         strB.append(", ").append(fileName).append(")\n");

410       }

411 

412       // Append the list to the end of the yam file

413       PrintWriter pw = new PrintWriter(

414               new FileOutputStream(yamFile, true), true);

415       pw.append(strB.toString());

416       pw.close();

417       

418     }

419 

420   }

421 

422   /**

423    * Copy one File to another.

424    * @param in The File that will be copied

425    * @param out The File to which in will be copied

426    * @throws IOException if the copy fails

427    */

428   private static void copy(File in, File out) throws IOException {

429 

430       if(!out.getParentFile().isDirectory()) out.getParentFile().mkdirs();

431       FileChannel ic = new FileInputStream(in).getChannel();

432       FileChannel oc = new FileOutputStream(out).getChannel();

433       ic.transferTo(0, ic.size(), oc);

434       ic.close();

435       oc.close();

436   }

437 

438 

439 

440   /**

441    * A FilenameFilter that accepts JSPWiki .txt source files.

442    */

443   private static class JSPWikiFileFilter implements FilenameFilter {

444     /** Accept a file if it is a .txt file*/

445     public boolean accept(File dir, String name) {

446       return name.endsWith(".txt");

447     }

448   }

449 

450   /**

451    * Run the JSPWikiToYamConverter, translating the files specified on the

452    * command line from JSPWiki to YAM format.

453    * @param args (JSPWiki file | JSPWiki directory) [output directory]

454    */

455   public static void main(String[] args){

456 

457     if(args.length < 1) {

458       printUsage();

459       System.exit(1);

460     }

461 

462     // Get the files to convert

463     File inFile = new File(args[0]);

464     List<File> filesToConvert = new ArrayList<File>();

465     if(inFile.isFile()){

466 

467       // If args[0] is a normal file, convert it to a yam file.

468       filesToConvert.add(inFile);

469 

470     } else if(inFile.isDirectory()) {

471 

472       //  If it is a directory, convert every .txt file within to a yam file.

473       File[] filesInDir = inFile.listFiles(new JSPWikiFileFilter());

474       filesToConvert.addAll(Arrays.asList(filesInDir));

475       

476     } else {

477       // If it is neither a normal file nor a directory

478       printUsage();

479       System.exit(1);

480     }

481 

482     // Where shall we write output?

483     String outDirName = null;

484     if(args.length > 1) {

485       outDirName = args[1];

486       if(! new File(outDirName).isDirectory()) {

487         printUsage();

488         System.exit(1);

489       }

490     }

491 

492     // Collect errors

493     List<String> errors = new ArrayList<String>();

494 

495     // Collect all the yam files for generation. We do this

496     // as a separate step to ensure linked files are present

497     List<YamFile> yamsToGenerate = new ArrayList<YamFile>();

498 

499     // Translate each file

500     for(File jspwFile : filesToConvert){

501 

502       // Make a yam disk file. It will have the same filename prefix as its

503       // source, and a .yam suffix.

504       String jspwFileName = jspwFile.getName();

505       String prefix = jspwFileName.substring(0, jspwFileName.length() - 4);

506       String yamFileName = prefix + ".yam";

507       File yamDiskFile = new File(outDirName, yamFileName);

508 

509       try {

510 

511 

512         //---------------------------------------------------------

513         // uncomment this section if you need to see the

514         // JSPWikiMarkupParser's dom.

515         //---------------------------------------------------------

516  //       JSPWikiMarkupParser parser =

517  //               new JSPWikiMarkupParser(

518  //                       new InputStreamReader(new FileInputStream(jspwFile),

519  //                                             INPUT_ENCODING));

520  //       Document jdomDoc = parser.parse();

521  //       org.jdom.output.XMLOutputter jspwOut

522  //               = new org.jdom.output.XMLOutputter();

523  //       jspwOut.output(jdomDoc,

524  //                      new PrintWriter(new File(outDirName,

525  //                                               prefix + ".jspw.html"),

526  //                                      OUTPUT_ENCODING));

527         //---------------------------------------------------------

528 

529         // Make the YAM disk file

530         System.out.println("Translating " + jspwFileName);

531         PrintWriter yamOut = new PrintWriter(yamDiskFile, OUTPUT_ENCODING);

532         Reader reader = new InputStreamReader(

533               new FileInputStream(jspwFile), INPUT_ENCODING);

534 

535         yamOut.println(readerToStringWithTitle(reader, prefix));

536         yamOut.flush();

537 

538         // Make the YamFile from the disk file

539         YamFile yamFile = YamFile.get(

540                 new FileSystemResource(yamDiskFile.getCanonicalPath()));

541 

542         // Attachments. These are not listed in the JSPWiki text file, but need

543         // to be added to the end of the YAM.

544         processAttachments(jspwFile, yamDiskFile);

545 

546         //yamFile.generate();

547         yamFile.setContextPath(outDirName);

548         yamsToGenerate.add(yamFile);

549 

550       } catch(Exception e) {

551         e.printStackTrace();

552         errors.add(yamFileName + ": " + e.toString());

553       }

554 

555       // Generate - we do this once we have all yam files are present, to make

556       // sure linked files are present

557       for(YamFile yam : yamsToGenerate){

558         try{

559           yam.generate();

560         } catch(GateException ge){

561           ge.printStackTrace();

562           errors.add(yam + ": " + ge.toString());

563         }

564       }

565 

566     } // end of for

567 

568     // Report the errors

569     System.out.println("Translation finished with " + errors.size()

570             + " errors");

571     for(String error : errors) {

572       System.out.println(error);

573     }

574 

575   } // end of main

576 

577   /**

578    * Print the command line usage of th is class to standard out.

579    */

580   private static void printUsage() {

581     System.out.println("JSPWikiToYamConverter - convert JSPWiki files to YAM");

582     System.out.println("Usage:");

583     System.out.println("  JSPWikiToYamConverter (file|directory) [outputDir]");

584     System.out.println("    file:      JSPWiki file to translate");

585     System.out.println("    directory: directory of files to translate");

586     System.out.println("    outputDir: directory to write YAM files to");

587     System.out.println("               (defaults to current directory)");

588   }

589 

590 

591   

592 } //