Patch from Shaun Kalley from bug #56023 - Allow XSSF event model to find + return comments, and use this for the event based .xlsx text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1613266 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-07-24 20:13:54 +00:00
parent a41a4fd118
commit 14d5fa912f
4 changed files with 252 additions and 23 deletions

View File

@ -16,13 +16,22 @@
==================================================================== */
package org.apache.poi.xssf.eventusermodel;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTComment;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@ -54,6 +63,15 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
*/
private StylesTable stylesTable;
/**
* Table with cell comments
*/
private CommentsTable commentsTable;
/**
* Read only access to the shared strings table, for looking
* up (most) string cell's contents
*/
private ReadOnlySharedStringsTable sharedStringsTable;
/**
@ -78,6 +96,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
private short formatIndex;
private String formatString;
private final DataFormatter formatter;
private int rowNum;
private String cellRef;
private boolean formulasNotResults;
@ -86,6 +105,31 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
private StringBuffer formula = new StringBuffer();
private StringBuffer headerFooter = new StringBuffer();
private Queue<CellReference> commentCellRefs;
/**
 * Creates a sheet handler that can also report cell comments.
 *
 * @param styles Table of styles
 * @param comments Table of cell comments; may be <code>null</code>, in which
 *        case no comment lookups are made and no empty-cell comments are reported
 * @param strings Table of shared strings
 * @param sheetContentsHandler Callback that receives the rows, cells and
 *        comments found while parsing
 * @param dataFormatter Formatter kept for use while parsing
 * @param formulasNotResults Flag kept for use while parsing; presumably selects
 *        formula text over cached results (it is only stored here)
 */
public XSSFSheetXMLHandler(
        StylesTable styles,
        CommentsTable comments,
        ReadOnlySharedStringsTable strings,
        SheetContentsHandler sheetContentsHandler,
        DataFormatter dataFormatter,
        boolean formulasNotResults) {
    // Lookup tables
    this.stylesTable = styles;
    this.sharedStringsTable = strings;
    this.commentsTable = comments;

    // Output and formatting collaborators
    this.output = sheetContentsHandler;
    this.formatter = dataFormatter;

    // Parsing state
    this.formulasNotResults = formulasNotResults;
    this.nextDataType = xssfDataType.NUMBER;

    // Needs commentsTable to be assigned already
    init();
}
/**
* Accepts objects needed while parsing.
*
@ -98,13 +142,9 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
SheetContentsHandler sheetContentsHandler,
DataFormatter dataFormatter,
boolean formulasNotResults) {
this.stylesTable = styles;
this.sharedStringsTable = strings;
this.output = sheetContentsHandler;
this.formulasNotResults = formulasNotResults;
this.nextDataType = xssfDataType.NUMBER;
this.formatter = dataFormatter;
this(styles, null, strings, sheetContentsHandler, dataFormatter, formulasNotResults);
}
/**
* Accepts objects needed while parsing.
*
@ -118,6 +158,16 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
boolean formulasNotResults) {
this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults);
}
/**
 * Fills the queue of commented cell references from the comments table,
 * keeping the order in which the table lists them. Does nothing when no
 * comments table was supplied, leaving the queue unset.
 */
private void init() {
    if (commentsTable == null) {
        return;
    }

    commentCellRefs = new LinkedList<CellReference>();
    List<CTComment> ctComments = commentsTable.getCTComments().getCommentList().getCommentList();
    for (CTComment ctComment : ctComments) {
        CellReference commentedCell = new CellReference(ctComment.getRef());
        commentCellRefs.add(commentedCell);
    }
}
private boolean isTextTag(String name) {
if("v".equals(name)) {
@ -190,7 +240,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
headerFooter.setLength(0);
}
else if("row".equals(name)) {
int rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
output.startRow(rowNum);
}
// c => cell
@ -304,14 +354,25 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
break;
}
// Do we have a comment for this cell?
checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
XSSFComment comment = commentsTable != null ? commentsTable.findCellComment(cellRef) : null;
// Output
output.cell(cellRef, thisStr);
output.cell(cellRef, thisStr, comment);
} else if ("f".equals(name)) {
fIsOpen = false;
} else if ("is".equals(name)) {
isIsOpen = false;
} else if ("row".equals(name)) {
output.endRow();
// Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
// Finish up the row
output.endRow(rowNum);
} else if ("sheetData".equals(name)) {
// Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
}
else if("oddHeader".equals(name) || "evenHeader".equals(name) ||
"firstHeader".equals(name)) {
@ -342,6 +403,90 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
headerFooter.append(ch, start, length);
}
}
/**
 * Reports, through {@link #outputEmptyCellComment(CellReference)}, queued
 * comments belonging to cells that did not appear in the sheet XML.
 * <p>
 * What is reported depends on the check type:
 * <ul>
 * <li>END_OF_SHEET_DATA: every comment still queued.</li>
 * <li>CELL: queued comments ordered (row, then column) before the current
 *  cell and not beyond the current row. If the head of the queue is the
 *  current cell itself it is just dropped from the queue, since the caller
 *  reports that comment together with the cell.</li>
 * <li>END_OF_ROW: queued comments ordered after the last cell reference
 *  seen and not beyond the current row. If no cell reference has been seen
 *  at all, comments whose row equals the current row.</li>
 * </ul>
 *
 * @param type the point in parsing at which the check is made
 * @throws IllegalStateException if no cell reference has been seen and the
 *         check is not an END_OF_ROW (or END_OF_SHEET_DATA) check
 */
private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
    // Nothing to do without comments, or once they are all handled
    if (commentCellRefs == null || commentCellRefs.isEmpty()) {
        return;
    }

    // End of the sheet data: flush whatever is left
    if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
        while (!commentCellRefs.isEmpty()) {
            outputEmptyCellComment(commentCellRefs.remove());
        }
        return;
    }

    // No cell seen so far: only valid at the end of a row, where the
    // comments sitting on this very row are reported
    if (this.cellRef == null) {
        if (type != EmptyCellCommentsCheckType.END_OF_ROW) {
            throw new IllegalStateException("Cell ref should be null only if there are only empty cells in the row; rowNum: " + rowNum);
        }
        while (!commentCellRefs.isEmpty() && commentCellRefs.peek().getRow() == rowNum) {
            outputEmptyCellComment(commentCellRefs.remove());
        }
        return;
    }

    // The reference of the latest cell does not change while we drain the queue
    final CellReference latestCell = new CellReference(this.cellRef);
    while (true) {
        CellReference pending = commentCellRefs.peek();

        // The comment is on the cell being handled: the caller outputs it with the cell
        if (type == EmptyCellCommentsCheckType.CELL && latestCell.equals(pending)) {
            commentCellRefs.remove();
            return;
        }

        // Fill in gaps where commented empty cells are mixed in with non-empty cells
        int order = cellRefComparator.compare(pending, latestCell);
        boolean afterLatestCell = order > 0 && type == EmptyCellCommentsCheckType.END_OF_ROW;
        boolean beforeLatestCell = order < 0 && type == EmptyCellCommentsCheckType.CELL;
        if (!(afterLatestCell || beforeLatestCell) || pending.getRow() > rowNum) {
            return;
        }

        outputEmptyCellComment(commentCellRefs.remove());
        if (commentCellRefs.isEmpty()) {
            return;
        }
    }
}
/**
 * Looks up the comment stored for the given cell and passes it, together
 * with the cell's formatted reference, to the contents handler as an
 * empty-cell comment.
 *
 * @param cellRef reference of the commented cell that has no content
 */
private void outputEmptyCellComment(CellReference cellRef) {
    final String formattedRef = cellRef.formatAsString();
    output.emptyCellComment(formattedRef, commentsTable.findCellComment(formattedRef));
}
/**
 * The points during parsing at which checkForEmptyCellComments is invoked.
 */
private enum EmptyCellCommentsCheckType {
/** A cell has just been read and is about to be passed to the handler */
CELL,
/** A row element has ended, before the handler is told the row ended */
END_OF_ROW,
/** The sheetData element has ended; any comments still queued are flushed */
END_OF_SHEET_DATA
}
/**
 * Orders cell references by row, and by column within the same row.
 */
private static final Comparator<CellReference> cellRefComparator = new Comparator<CellReference>() {
    @Override
    public int compare(CellReference o1, CellReference o2) {
        int byRow = compare(o1.getRow(), o2.getRow());
        if (byRow != 0) {
            return byRow;
        }
        return compare(o1.getCol(), o2.getCol());
    }

    /** Three-way comparison of two ints: -1, 0 or 1 */
    public int compare(int x, int y) {
        if (x < y) {
            return -1;
        }
        if (x == y) {
            return 0;
        }
        return 1;
    }
};
/**
* You need to implement this to handle the results
@ -351,9 +496,11 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
/** A row with the (zero based) row number has started */
public void startRow(int rowNum);
/** A row with the (zero based) row number has ended */
public void endRow();
/** A cell, with the given formatted value, was encountered */
public void cell(String cellReference, String formattedValue);
public void endRow(int rowNum);
/** A cell, with the given formatted value, and possibly a comment, was encountered */
public void cell(String cellReference, String formattedValue, XSSFComment comment);
/** A comment for an otherwise-empty cell was encountered */
public void emptyCellComment(String cellReference, XSSFComment comment);
/** A header or footer has been encountered */
public void headerFooter(String text, boolean isHeader, String tagName);
}

View File

@ -39,7 +39,9 @@ import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException;
@ -60,6 +62,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
private Locale locale;
private boolean includeTextBoxes = true;
private boolean includeSheetNames = true;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true;
private boolean formulasNotResults = false;
@ -112,11 +115,10 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
}
/**
* Would control the inclusion of cell comments from the document,
* if we supported it
* Should cell comments be included? Default is false
*/
public void setIncludeCellComments(boolean includeCellComments) {
throw new IllegalStateException("Comment extraction not supported in streaming mode, please use XSSFExcelExtractor");
this.includeCellComments = includeCellComments;
}
public void setLocale(Locale locale) {
@ -159,6 +161,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
public void processSheet(
SheetContentsHandler sheetContentsExtractor,
StylesTable styles,
CommentsTable comments,
ReadOnlySharedStringsTable strings,
InputStream sheetInputStream)
throws IOException, SAXException {
@ -176,7 +179,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
SAXParser saxParser = saxFactory.newSAXParser();
XMLReader sheetParser = saxParser.getXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(
styles, strings, sheetContentsExtractor, formatter, formulasNotResults);
styles, comments, strings, sheetContentsExtractor, formatter, formulasNotResults);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) {
@ -203,7 +206,8 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
text.append(iter.getSheetName());
text.append('\n');
}
processSheet(sheetExtractor, styles, strings, stream);
CommentsTable comments = includeCellComments ? iter.getSheetComments() : null;
processSheet(sheetExtractor, styles, comments, strings, stream);
if (includeHeadersFooters) {
sheetExtractor.appendHeaderText(text);
}
@ -268,17 +272,32 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
firstCellOfRow = true;
}
public void endRow() {
public void endRow(int rowNum) {
output.append('\n');
}
public void cell(String cellRef, String formattedValue) {
public void cell(String cellRef, String formattedValue, XSSFComment comment) {
if(firstCellOfRow) {
firstCellOfRow = false;
} else {
output.append('\t');
}
output.append(formattedValue);
if (formattedValue != null) {
output.append(formattedValue);
}
if (includeCellComments && comment != null) {
String commentText = comment.getString().getString().replace('\n', ' ');
output.append(formattedValue != null ? " Comment by " : "Comment by ");
if (commentText.startsWith(comment.getAuthor() + ": ")) {
output.append(commentText);
} else {
output.append(comment.getAuthor()).append(": ").append(commentText);
}
}
}
/**
 * Handles a comment on a cell that has no content, by treating it as a
 * cell whose formatted value is <code>null</code>.
 */
public void emptyCellComment(String cellRef, XSSFComment comment) {
cell(cellRef, null, comment);
}
public void headerFooter(String text, boolean isHeader, String tagName) {
@ -287,7 +306,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
}
}
/**
* Append the text for the named header or footer if found.
*/

View File

@ -20,13 +20,13 @@ package org.apache.poi.xssf.extractor;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.xssf.XSSFTestDataSamples;
import junit.framework.TestCase;
/**
* Tests for {@link XSSFEventBasedExcelExtractor}
*/
@ -240,4 +240,68 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
fixture.setIncludeHeadersFooters(false);
assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText());
}
/**
 * Checks the comment output of XSSFEventBasedExcelExtractor against that
 * of XSSFExcelExtractor for the same sample workbook.
 * <p>
 * With comments switched off both extractors must give the same text. With
 * comments switched on, the event based output differs in two ways:
 * (1) comments from empty cells are included, and (2) the author is not
 * written twice.
 * <p>
 * The expectations here need updating if those two improvements are ever
 * ported to XSSFExcelExtractor.
 */
public void testCommentsComparedToNonEventBasedExtractor()
        throws Exception {
    final String sampleName = "commentTest.xlsx";

    // Common to both extractors when comments are not requested
    final String textWithoutComments =
            "Sheet1\n" +
            "\n" +
            "abc\n" +
            "\n" +
            "123\n" +
            "\n" +
            "\n" +
            "\n";

    // Usermodel based extractor
    final String userModelTextWithComments =
            "Sheet1\n" +
            "\n" +
            "abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" +
            "\n" +
            "123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" +
            "\n" +
            "\n" +
            "\n";

    XSSFExcelExtractor userModelExtractor = new XSSFExcelExtractor(
            XSSFTestDataSamples.openSampleWorkbook(sampleName));
    try {
        assertEquals(textWithoutComments, userModelExtractor.getText());

        userModelExtractor.setIncludeCellComments(true);
        assertEquals(userModelTextWithComments, userModelExtractor.getText());
    } finally {
        userModelExtractor.close();
    }

    // Event based extractor
    final String eventModelTextWithComments =
            "Sheet1\n" +
            "Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" +
            "abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" +
            "Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" +
            "Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" +
            "Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" +
            "Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" +
            "Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n";

    XSSFEventBasedExcelExtractor eventModelExtractor =
            new XSSFEventBasedExcelExtractor(
                    XSSFTestDataSamples.openSamplePackage(sampleName));
    try {
        assertEquals(textWithoutComments, eventModelExtractor.getText());

        eventModelExtractor.setIncludeCellComments(true);
        assertEquals(eventModelTextWithComments, eventModelExtractor.getText());
    } finally {
        eventModelExtractor.close();
    }
}
}

Binary file not shown.