001 /*
002 * YamFormatter.java Copyright (c) 1998-2008, The University of Sheffield.
003 *
004 * This code is from the GATE project (http://gate.ac.uk/) and is free software
005 * licenced under the GNU General Public License version 3. It is distributed
006 * without any warranty. For more details see COPYING.txt in the top level
007 * directory (or at http://gatewiki.sf.net/COPYING.txt).
008 */
009 package gate.yam.format;
010
011 import java.io.*;
012 import java.net.*;
013 import java.util.ArrayList;
014 import java.util.Arrays;
015 import java.util.Collections;
016 import java.util.HashMap;
017 import java.util.HashSet;
018 import java.util.List;
019 import java.util.Map;
020 import java.util.Set;
021
022 import org.apache.log4j.Logger;
023 import gate.*;
024 import gate.util.*;
025 import gate.util.persistence.PersistenceManager;
026
027 /**
028 * Format utility for YAM based on JAPE.
029 *
030 * @author Niraj Aswani
031 */
032 public class YamFormatter {
033
/** Logger shared by all static methods of this class. */
static final Logger log = Logger.getLogger("gate.yam.format.YamFormatter");

/**
 * Spaces table used for storing the desired min/max/normal number of blank
 * lines before/after the Edit annotations. It is keyed by the value of the
 * Edit annotation's "type" feature.
 */
static final Map<String, SpacesTable> spacesTable =
    new HashMap<String, SpacesTable>();

// spaces initialisation
static {
  // Argument order, as recorded by the original author of this table:
  // SpacesTable(minBefore, maxBefore, normBefore, minAfter, maxAfter,
  // normAfter)
  //                                              <b >b =b <a >a =a
  spacesTable.put("SectionHead", new SpacesTable(2, 3, 2, 1, 2, 1));
  spacesTable.put("Contents", new SpacesTable(0, 1, 0, 1, 3, 1));
  spacesTable.put("List", new SpacesTable(1, 2, 1, 1, 2, 1));
  spacesTable.put("Paragraph", new SpacesTable(1, 2, 1, 1, 2, 1));
} // spaces initialisation

/**
 * DocumentProcessor used to process the GATE document. Remains null until
 * either a custom processor is supplied or a default one is created on first
 * use.
 */
private static DocumentProcessor documentProcessor;

/**
 * Yam Parser xgapp file name, used if no explicit DocumentProcessor has been
 * provided. The name is resolved against the GATE home directory
 * (Gate.getGateHome()).
 * NOTE(review): the original comment described this as relative to
 * web-app/WEB-INF/gate - presumably the GATE home of the web deployment;
 * confirm.
 */
static final String YAM_PARSER_XGAPP = "/applications/yam-parser.xgapp";

/**
 * Annotation type for the Wrap annotations.
 */
static final String WRAP_AT = "Wrap";

/**
 * Annotation type for the Edit annotations.
 */
static final String EDIT_AT = "Edit";

/**
 * Name of the Edit annotation feature that holds the number of blank lines
 * found before the annotation.
 */
static final String NUM_BLANK_LINES_BEFORE_FEATURE_NAME =
    "numBlankLinesBefore";

/**
 * Name of the Edit annotation feature that holds the number of blank lines
 * found after the annotation.
 */
static final String NUM_BLANK_LINES_AFTER_FEATURE_NAME = "numBlankLinesAfter";
088
089 static public void setDocumentProcessor(DocumentProcessor p) {
090 log.info("Using custom DocumentProcessor: " + p);
091 documentProcessor = p;
092 }
093
094 /**
095 * Normalise the spacing in an annotated YAM document.
096 *
097 * @param doc
098 * a GATE doc that's annotated with all the YAM constituents
099 * @param content
100 * the content that the GATE doc was built from
101 * @return the StringBuilder is adapted
102 */
103 static public void normalise(Document doc, StringBuilder content) {
104 // step 1 - process document with the xgapp file
105 // the application will produce two types of annotations
106 // (look at the static variables WRAP_AT and EDIT_AT)
107
108 // commenting this out because yam-parser xgapp is not working yet
109 try {
110 if(documentProcessor == null) {
111 setupDefaultDocumentProcessor();
112 }
113 documentProcessor.processDocument(doc);
114 }
115 catch(RuntimeException e) {
116 throw e;
117 }
118 catch(Exception e) {
119 throw new RuntimeException(e);
120 }
121 // TODO
122 // formatDocument(doc, content);
123 } // normalise(Document)
124
125 /**
126 * Set up a default simple document processor if one has not been configured
127 * externally.
128 */
129 private static void setupDefaultDocumentProcessor() throws GateException,
130 IOException {
131 URL gappFileUrl = null;
132 try {
133 gappFileUrl =
134 new File(Gate.getGateHome(), YAM_PARSER_XGAPP).toURI().toURL();
135 } catch(MalformedURLException e) {
136 throw new RuntimeException("Bad URL for the " + YAM_PARSER_XGAPP
137 + " resource: " + gappFileUrl);
138 }
139 if(gappFileUrl == null)
140 throw new RuntimeException("Could not find the " + YAM_PARSER_XGAPP
141 + " resource");
142
143 // load the saved application
144 CorpusController application =
145 (CorpusController)PersistenceManager
146 .loadObjectFromUrl(gappFileUrl);
147 documentProcessor = new CorpusControllerDocumentProcessor();
148 ((CorpusControllerDocumentProcessor)documentProcessor).setController(
149 application);
150 }
151
152 /**
153 * This method formats the document for its white spaces - adding more where
154 * needed and deleting where are more than desired.
155 *
156 * @param doc
157 * @param content
158 */
159 private static void formatDocument(Document doc, StringBuilder content) {
160
161 // annotation types to process
162 final String EDIT_AT = "Edit";
163 final String WRAP_AT = "Wrap";
164
165 Set<String> annotationsToChange = new HashSet<String>();
166 annotationsToChange.add(EDIT_AT);
167 annotationsToChange.add(WRAP_AT);
168
169 // obtain annotation set
170 AnnotationSet inputAS =
171 doc.getAnnotations("YAM Constituents").get(annotationsToChange);
172
173 // sort them first
174 List<Annotation> annotationsToProcess = new ArrayList<Annotation>(inputAS);
175 Collections.sort(annotationsToProcess, new OffsetComparator());
176
177 // buffer to store formatted content
178 StringBuilder formattedBuffer = new StringBuilder();
179
180 // to keep track of the position to which the contents have been copied
181 // from the original document content
182 int contentCopiedTill = 0;
183
184 // position where the new lines (as part of Edit annotations formatting)
185 // should be added or removed from
186 int nlPosition = 0;
187
188 /*
189 *Keeps track of how many nls were added at the end of the last
190 *seen annotation.
191 */
192 int nlsAdded = 0;
193
194 /*
195 * Some Edit annotations have wrap annotations inside them.
196 * <Edit>
197 * <Wrap>- list item1 </Wrap>
198 * <Wrap>- list item2 </Wrap>
199 * <Wrap>- list item3 </Wrap>
200 * </Edit>
201 *
202 * When sorted, they appear in the following order
203 * Wrap, Edit, Wrap, Wrap
204 *
205 * New lines need to be added/removed before/after the Edit annotations
206 * based on the number of new lines are already there.
207 * In the above example, newlines should be added before the first Wrap
208 * annotation as well as after the last Wrap annotation. However the
209 * information about how many new lines are already there is in the Edit
210 * annotation. When we get to the last Wrap annotation we would need to
211 * have access to the Edit annotation.
212 *
213 * The following variable is useful for remembering the last seen Edit
214 * annotation's offsets. When we reach at the last Wrap annotation
215 * in the above sequence, whose endOffset is equal to the last seen Edit
216 * annotation's end offset, we add new lines.
217 */
218 Annotation lastEditAnnotation = null;
219
220 // processing one annotation at time
221 for(Annotation currentAnnotation : annotationsToProcess) {
222
223 // offsets
224 int startOffset = currentAnnotation.getStartNode().getOffset().intValue();
225 int endOffset = currentAnnotation.getEndNode().getOffset().intValue();
226
227 /*
228 * before we add any further content,
229 * check if we need to add new lines for the lastEditAnnotation.
230 * This happens when 1) we come across a wrap annotation that has same
231 * end offset as the end offset of the lastEditAnnotation (this condition
232 * is checked later when the Wrap annotation is processed) and
233 * 2) the current annotation is a new annotation that starts after the
234 * last seen Edit annotation.
235 */
236 if(lastEditAnnotation != null) {
237 int lastEditAnnotationEndOffset =
238 lastEditAnnotation.getEndNode().getOffset().intValue();
239
240 if(startOffset >= lastEditAnnotationEndOffset) {
241 nlsAdded = nlsAfterAnnotation(formattedBuffer, lastEditAnnotation,
242 formattedBuffer.length());
243 lastEditAnnotation = null;
244 }
245 }
246
247 // see if there's any content not annotated in the document (e.g. verbatim
248 // tables etc? copy the text over to the new string
249 int numCharsToCopy = startOffset - contentCopiedTill;
250 if(numCharsToCopy > 0) {
251 // copying from original content to the new content
252 formattedBuffer.append(
253 content.substring(contentCopiedTill, startOffset));
254
255 // contents copied till the startOffset of the current annotation
256 contentCopiedTill = startOffset;
257
258 // if need be, where to add new lines to or remove from
259 nlPosition = formattedBuffer.length();
260
261 // at the end of string copy, we don't have any nls added
262 nlsAdded = 0;
263 }
264
265 // if it is a wrap annotation
266 if(currentAnnotation.getType().equals(WRAP_AT)) {
267
268 // range to process
269 StringBuilder wrapRange =
270 new StringBuilder(content.substring(startOffset, endOffset));
271
272 // calculate indentLevel
273 int indentLevel = calculateIndentLevel(wrapRange);
274
275 // wrap the content
276 wrap(wrapRange, 0, wrapRange.length(), indentLevel, 79);
277
278 // if need be, where to add new lines
279 nlPosition = formattedBuffer.length();
280
281 // append the wrapped text
282 formattedBuffer.append(wrapRange);
283
284 // if the endoffset of the wrap annotation is same as the
285 // endoffset of the last edit annotation, this is where we
286 // add new lines
287 if(lastEditAnnotation != null) {
288 int lastEditAnnotationEndOffset =
289 lastEditAnnotation.getEndNode().getOffset().intValue();
290
291 // only if the current wrap annotation ends at the same place
292 // where the edit annotation ended
293 if(lastEditAnnotationEndOffset == endOffset) {
294 nlsAdded = nlsAfterAnnotation(formattedBuffer, lastEditAnnotation,
295 formattedBuffer.length());
296 lastEditAnnotation = null;
297 } else if(endOffset > lastEditAnnotationEndOffset) {
298 log.error("Overlapping " + WRAP_AT +
299 " annotation found at position "+startOffset);
300 /*throw new RuntimeException("Overlapping " + WRAP_AT +
301 " annotation found at position "+startOffset);*/
302 }
303 }
304
305 // update the contentCopiedTill variable
306 contentCopiedTill = endOffset;
307
308 } else if(currentAnnotation.getType().equals(EDIT_AT)) {
309
310 // add new lines before the annotation here
311 nlsBeforeAnnotation(formattedBuffer, currentAnnotation, nlPosition,
312 nlsAdded);
313
314 //reset nlsAdded
315 nlsAdded = 0;
316
317 // remember last seen edit annotation
318 lastEditAnnotation = currentAnnotation;
319 }
320
321 }// for
322
323 // finally if anything not copied into the output result?
324 if(contentCopiedTill < content.length()) {
325 formattedBuffer.append(content.substring(contentCopiedTill));
326 }
327
328 // replacing the original content with the new one
329 content.replace(0, content.length(), formattedBuffer.toString());
330
331 } // formatDocument(Document, StringBuilder)
332
333 /**
334 * This method counts the indentLevel by looking for white-spaces
335 * at the start of the first line.
336 *
337 * @param stOffset
338 * @param content
339 * @return
340 */
341 private static int calculateIndentLevel(StringBuilder content) {
342 int i = 0;
343 char ch = content.charAt(i);
344 int indentLevel = 0;
345 while(ch == ' ' || ch == '\t') {
346 if(ch == '\t') {
347 int tabLength = 8;
348 // assuming stOffset is the start of the line
349 tabLength -= (i) % 8;
350 indentLevel += tabLength;
351 } else {
352 indentLevel++;
353 }
354 i++;
355 ch = content.charAt(i);
356 }
357
358 boolean isListItem = ch == '-' || ch == '#';
359 // if listItem
360 if(isListItem) indentLevel += 2;
361 return indentLevel;
362 } // calculateIndentLevel(StringBuilder)
363
364 /**
365 * Removes new lines where more than desired and adds where too few
366 *
367 * @param content
368 * content of the document that should be formatted for new lines
369 * @param a
370 * Edit annotation which contains information about new lines
371 * before and after the annotations (as counted in the original
372 * document).
373 * @param insertAt
374 * position where the nls should be added or removed
375 */
376 private static int nlsAfterAnnotation(StringBuilder content, Annotation a,
377 int insertAt) {
378
379 // type of the edit annotation we're editing
380 String aType = (String)a.getFeatures().get("type");
381
382 // obtain the information about desired min/max/norm new lines for this
383 // annotation type
384 SpacesTable spaceTable = spacesTable.get(aType);
385
386 // if this happens, one should look at the spacesTable in the static
387 // block
388 if(spaceTable == null)
389 throw new RuntimeException("could not recognize " + EDIT_AT
390 + " with type " + aType);
391
392 // number of blank lines after the current annotation
393 int numBlankLinesAfter =
394 (Integer)a.getFeatures().get(NUM_BLANK_LINES_AFTER_FEATURE_NAME);
395 if(numBlankLinesAfter < spaceTable.getMinSpacesAfter()) {
396 int nlToAdd = spaceTable.getNormSpacesAfter() - numBlankLinesAfter;
397 if(insertNL(content, nlToAdd, insertAt)) {
398 return spaceTable.getNormSpacesAfter();
399 }
400 } else if(numBlankLinesAfter > spaceTable.getMaxSpacesAfter()) {
401 int nlToRemove = numBlankLinesAfter - spaceTable.getNormSpacesAfter();
402 if(removeNL(content, nlToRemove, insertAt, false)) {
403 return spaceTable.getNormSpacesAfter();
404 }
405 }
406
407 // how many newlines were there already after the current annotation
408 return numBlankLinesAfter;
409 } // nlsAfterAnnotation(StringBuilder, Annotation, int)
410
411 /**
412 * Removes new lines where more than desired and adds where too few
413 *
414 * @param content
415 * content of the document that should be formatted for new lines
416 * @param a
417 * Edit annotation which contains information about new lines
418 * before and after the annotations (as counted in the original
419 * document).
420 * @param insertAt
421 * position where the nls should be added or removed
422 * @param nlsToPreserve
423 * number of new lines must be preserved
424 */
425 private static void nlsBeforeAnnotation(StringBuilder content, Annotation a,
426 int insertAt, int nlsToPreserve) {
427
428 // type of the edit annotation we're editing
429 String aType = (String)a.getFeatures().get("type");
430
431 // obtain the information about desired min/max/norm new lines for this
432 // annotation type
433 SpacesTable spaceTable = spacesTable.get(aType);
434
435 // if this happes, one should look at the spacesTable in the static
436 // block
437 if(spaceTable == null)
438 throw new RuntimeException("could not recognize " + EDIT_AT
439 + " with type " + aType);
440
441 // number of blank lines before the current annotation
442 int numBlankLinesBefore =
443 (Integer)a.getFeatures().get(NUM_BLANK_LINES_BEFORE_FEATURE_NAME);
444
445 // these many nls are already there.
446 // nlsAlready there give us the number of spaces added as a result of
447 // previous Edit annotation
448 numBlankLinesBefore += nlsToPreserve;
449
450 // normal spaces for this annotation
451 int normSpacesBefore = spaceTable.getNormSpacesBefore();
452
453 // e.g. if <SectionHead> <Paragraph>
454 // while processing SectionHead, say we added/left 2 new lines after it;
455 // now while processing Paragraph, it says that there should be 1 nl before;
456 // we take the value whichever is the larger
457 if(nlsToPreserve > normSpacesBefore) {
458 normSpacesBefore = nlsToPreserve;
459 }
460
461 // nls before the current annotation should be edited only
462 // if it is not between the min and max desired spaces
463 if(numBlankLinesBefore < spaceTable.getMinSpacesBefore()) {
464 int nlToAdd = normSpacesBefore - numBlankLinesBefore;
465 insertNL(content, nlToAdd, insertAt-1);
466 } else if(numBlankLinesBefore > spaceTable.getMaxSpacesBefore()) {
467 int nlToRemove = numBlankLinesBefore - normSpacesBefore;
468 removeNL(content, nlToRemove, insertAt-1, true);
469 }
470
471 } // nlsBeforeAnnotation(StringBuilder, Annotation, int, int)
472
473 /**
474 * Removes new lines (represented by nlToRemove) at the specified position
475 * (removeAt)
476 *
477 * @param content
478 * @param nlToRemove
479 * @param removeAt
480 * @param backward
481 * indicates if the new lines to delete are in the backward direction
482 * @param true
483 * if this method carried out changes, false otherwise
484 */
485 private static boolean removeNL(StringBuilder content, int nlToRemove,
486 int removeAt, boolean backward) {
487 if(removeAt < 0 || removeAt >= content.length() || nlToRemove <= 0)
488 return false;
489
490 if(backward) {
491 // e.g. delete 2 new lines at position 10 in the backward direction
492 // 10 - 2 + 1 = 9 => new line will be deleted at position 9, which
493 // will bring position 10 to position 9 and the character at the position
494 // 9 will be deleted again
495 removeAt = removeAt - nlToRemove + 1;
496 }
497
498 for(int i = 0; i < nlToRemove; i++) {
499 content.deleteCharAt(removeAt);
500 }
501
502 return true;
503 } // removeNL(StringBuffer, int, int, boolean)
504
505 /**
506 * Inserts new lines (represented by nlToAdd) at the specified position
507 * (insertAt)
508 *
509 * @param content
510 * @param nlToAdd
511 * @param insertAt
512 * @return true if this method carried out changes, false otherwise
513 */
514 private static boolean insertNL(StringBuilder content, int nlToAdd,
515 int insertAt) {
516 if(insertAt < 0 || nlToAdd <= 0) return false;
517
518 // new lines to fill in
519 char[] nls = new char[nlToAdd];
520 Arrays.fill(nls, '\n');
521
522 if(insertAt < content.length()) {
523 content.insert(insertAt, nls);
524 return true;
525 } else if(insertAt == content.length()) {
526 content.append(nls);
527 return true;
528 }
529 return false;
530 } // insertNL(StringBuilder, int, int)
531
532 /**
533 * Wrap a range of text.
534 *
535 * @param inputText
536 * buffer that contains a range to modify
537 * @param startOffset
538 * start offset of the range to wrap
539 * @param endOffset
540 * end offset of the range to wrap
541 * @param indentLevel
542 * left margin for the range
543 * @param textWidth
544 * wrap margin
545 */
546 static public void wrap(StringBuilder inputText, int startOffset,
547 int endOffset, int indentLevel, int textWidth) {
548
549 // indent text
550 textWidth -= indentLevel;
551 char[] spaces = new char[indentLevel];
552 Arrays.fill(spaces, ' ');
553 String indent = new String(spaces);
554
555 // local buffer to apply changes to
556 StringBuffer buf =
557 new StringBuffer(inputText.substring(startOffset, endOffset));
558
559 // keeping track of last position of the space
560 int lastSpacePos = -1;
561
562 // current character position
563 int i = 0;
564
565 // current line's length in number of characters
566 int lineLen = 0;
567
568 // if the white spaces should be deleted (e.g. whitespaces at the start
569 // of
570 // a new line should be deleted. But spaces within two words on the same
571 // line should be kept)
572 boolean removeSpaces = false;
573
574 // one character at a time
575 while(i < buf.length()) {
576
577 // character at position i
578 char c = buf.charAt(i);
579
580 // if white space
581 if(c == ' ') {
582 if(removeSpaces) {
583 buf.deleteCharAt(i);
584 continue;
585 }
586 lastSpacePos = i; // remember this
587 }
588
589 // if tab
590 if(c == '\t') {
591 if(removeSpaces) {
592 buf.deleteCharAt(i);
593 continue;
594 }
595
596 // find out the spaces that
597 // should be added from the current position
598 int tabLength = 8;
599 tabLength -= ((lineLen + indentLevel) % tabLength);
600 lastSpacePos = i;
601 lineLen += tabLength - 1; // considering global lineLen++
602 }
603
604 // no more spaces to delete
605 removeSpaces = false;
606
607 // if it is a new line, replace with ' '
608 if(c == '\n') {
609 buf.setCharAt(i, ' ');
610 lastSpacePos = i;
611 removeSpaces = true;
612 }
613
614 // reached end of the margin
615 if(lineLen > textWidth - 1 && lastSpacePos != -1) {
616 buf.setCharAt(lastSpacePos, '\n');
617 lineLen = i - lastSpacePos + 1;
618 lastSpacePos = -1;
619 }
620 lineLen++;
621 i++;
622 }
623
624 // new wrapped text with indentation
625 for(int j = 0; j < buf.length() - 1; j++) {
626 char c = buf.charAt(j);
627 if(c == '\n') {
628 buf.insert(j + 1, indent);
629 }
630 }
631
632 // replacing the original content with wrapped content
633 inputText.replace(startOffset, endOffset, buf.toString());
634 } // wrap(StringBuilder, int, int, int, int)
635 } // YamFormatter
|