Patch from Shaun Kalley from bug #56023 - Allow XSSF event model to find + return comments, and use this for the event based .xlsx text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1613266 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a41a4fd118
commit
14d5fa912f
@ -16,13 +16,22 @@
|
||||
==================================================================== */
|
||||
package org.apache.poi.xssf.eventusermodel;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
|
||||
import org.apache.poi.ss.usermodel.BuiltinFormats;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.util.CellReference;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.xssf.model.CommentsTable;
|
||||
import org.apache.poi.xssf.model.StylesTable;
|
||||
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
|
||||
import org.apache.poi.xssf.usermodel.XSSFComment;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTComment;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
@ -54,6 +63,15 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
*/
|
||||
private StylesTable stylesTable;
|
||||
|
||||
/**
|
||||
* Table with cell comments
|
||||
*/
|
||||
private CommentsTable commentsTable;
|
||||
|
||||
/**
|
||||
* Read only access to the shared strings table, for looking
|
||||
* up (most) string cell's contents
|
||||
*/
|
||||
private ReadOnlySharedStringsTable sharedStringsTable;
|
||||
|
||||
/**
|
||||
@ -78,6 +96,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
private short formatIndex;
|
||||
private String formatString;
|
||||
private final DataFormatter formatter;
|
||||
private int rowNum;
|
||||
private String cellRef;
|
||||
private boolean formulasNotResults;
|
||||
|
||||
@ -86,6 +105,31 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
private StringBuffer formula = new StringBuffer();
|
||||
private StringBuffer headerFooter = new StringBuffer();
|
||||
|
||||
private Queue<CellReference> commentCellRefs;
|
||||
|
||||
/**
|
||||
* Accepts objects needed while parsing.
|
||||
*
|
||||
* @param styles Table of styles
|
||||
* @param strings Table of shared strings
|
||||
*/
|
||||
public XSSFSheetXMLHandler(
|
||||
StylesTable styles,
|
||||
CommentsTable comments,
|
||||
ReadOnlySharedStringsTable strings,
|
||||
SheetContentsHandler sheetContentsHandler,
|
||||
DataFormatter dataFormatter,
|
||||
boolean formulasNotResults) {
|
||||
this.stylesTable = styles;
|
||||
this.commentsTable = comments;
|
||||
this.sharedStringsTable = strings;
|
||||
this.output = sheetContentsHandler;
|
||||
this.formulasNotResults = formulasNotResults;
|
||||
this.nextDataType = xssfDataType.NUMBER;
|
||||
this.formatter = dataFormatter;
|
||||
init();
|
||||
}
|
||||
|
||||
/**
|
||||
* Accepts objects needed while parsing.
|
||||
*
|
||||
@ -98,13 +142,9 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
SheetContentsHandler sheetContentsHandler,
|
||||
DataFormatter dataFormatter,
|
||||
boolean formulasNotResults) {
|
||||
this.stylesTable = styles;
|
||||
this.sharedStringsTable = strings;
|
||||
this.output = sheetContentsHandler;
|
||||
this.formulasNotResults = formulasNotResults;
|
||||
this.nextDataType = xssfDataType.NUMBER;
|
||||
this.formatter = dataFormatter;
|
||||
this(styles, null, strings, sheetContentsHandler, dataFormatter, formulasNotResults);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accepts objects needed while parsing.
|
||||
*
|
||||
@ -118,6 +158,16 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
boolean formulasNotResults) {
|
||||
this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults);
|
||||
}
|
||||
|
||||
private void init() {
|
||||
if (commentsTable != null) {
|
||||
commentCellRefs = new LinkedList<CellReference>();
|
||||
List<CTComment> commentList = commentsTable.getCTComments().getCommentList().getCommentList();
|
||||
for (CTComment comment : commentList) {
|
||||
commentCellRefs.add(new CellReference(comment.getRef()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isTextTag(String name) {
|
||||
if("v".equals(name)) {
|
||||
@ -190,7 +240,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
headerFooter.setLength(0);
|
||||
}
|
||||
else if("row".equals(name)) {
|
||||
int rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
|
||||
rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
|
||||
output.startRow(rowNum);
|
||||
}
|
||||
// c => cell
|
||||
@ -304,14 +354,25 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
break;
|
||||
}
|
||||
|
||||
// Do we have a comment for this cell?
|
||||
checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
|
||||
XSSFComment comment = commentsTable != null ? commentsTable.findCellComment(cellRef) : null;
|
||||
|
||||
// Output
|
||||
output.cell(cellRef, thisStr);
|
||||
output.cell(cellRef, thisStr, comment);
|
||||
} else if ("f".equals(name)) {
|
||||
fIsOpen = false;
|
||||
} else if ("is".equals(name)) {
|
||||
isIsOpen = false;
|
||||
} else if ("row".equals(name)) {
|
||||
output.endRow();
|
||||
// Handle any "missing" cells which had comments attached
|
||||
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
|
||||
|
||||
// Finish up the row
|
||||
output.endRow(rowNum);
|
||||
} else if ("sheetData".equals(name)) {
|
||||
// Handle any "missing" cells which had comments attached
|
||||
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
|
||||
}
|
||||
else if("oddHeader".equals(name) || "evenHeader".equals(name) ||
|
||||
"firstHeader".equals(name)) {
|
||||
@ -342,6 +403,90 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
headerFooter.append(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a check for, and output, comments in otherwise empty cells.
|
||||
*/
|
||||
private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
|
||||
if (commentCellRefs != null && !commentCellRefs.isEmpty()) {
|
||||
// If we've reached the end of the sheet data, output any
|
||||
// comments we haven't yet already handled
|
||||
if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
|
||||
while (!commentCellRefs.isEmpty()) {
|
||||
outputEmptyCellComment(commentCellRefs.remove());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// At the end of a row, handle any comments for "missing" rows before us
|
||||
if (this.cellRef == null) {
|
||||
if (type == EmptyCellCommentsCheckType.END_OF_ROW) {
|
||||
while (!commentCellRefs.isEmpty()) {
|
||||
if (commentCellRefs.peek().getRow() == rowNum) {
|
||||
outputEmptyCellComment(commentCellRefs.remove());
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
throw new IllegalStateException("Cell ref should be null only if there are only empty cells in the row; rowNum: " + rowNum);
|
||||
}
|
||||
}
|
||||
|
||||
CellReference nextCommentCellRef;
|
||||
do {
|
||||
CellReference cellRef = new CellReference(this.cellRef);
|
||||
CellReference peekCellRef = commentCellRefs.peek();
|
||||
if (type == EmptyCellCommentsCheckType.CELL && cellRef.equals(peekCellRef)) {
|
||||
// remove the comment cell ref from the list if we're about to handle it alongside the cell content
|
||||
commentCellRefs.remove();
|
||||
return;
|
||||
} else {
|
||||
// fill in any gaps if there are empty cells with comment mixed in with non-empty cells
|
||||
int comparison = cellRefComparator.compare(peekCellRef, cellRef);
|
||||
if (comparison > 0 && type == EmptyCellCommentsCheckType.END_OF_ROW && peekCellRef.getRow() <= rowNum) {
|
||||
nextCommentCellRef = commentCellRefs.remove();
|
||||
outputEmptyCellComment(nextCommentCellRef);
|
||||
} else if (comparison < 0 && type == EmptyCellCommentsCheckType.CELL && peekCellRef.getRow() <= rowNum) {
|
||||
nextCommentCellRef = commentCellRefs.remove();
|
||||
outputEmptyCellComment(nextCommentCellRef);
|
||||
} else {
|
||||
nextCommentCellRef = null;
|
||||
}
|
||||
}
|
||||
} while (nextCommentCellRef != null && !commentCellRefs.isEmpty());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Output an empty-cell comment.
|
||||
*/
|
||||
private void outputEmptyCellComment(CellReference cellRef) {
|
||||
String cellRefString = cellRef.formatAsString();
|
||||
XSSFComment comment = commentsTable.findCellComment(cellRefString);
|
||||
output.emptyCellComment(cellRefString, comment);
|
||||
}
|
||||
|
||||
private enum EmptyCellCommentsCheckType {
|
||||
CELL,
|
||||
END_OF_ROW,
|
||||
END_OF_SHEET_DATA
|
||||
}
|
||||
private static final Comparator<CellReference> cellRefComparator = new Comparator<CellReference>() {
|
||||
@Override
|
||||
public int compare(CellReference o1, CellReference o2) {
|
||||
int result = compare(o1.getRow(), o2.getRow());
|
||||
if (result == 0) {
|
||||
result = compare(o1.getCol(), o2.getCol());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
public int compare(int x, int y) {
|
||||
return (x < y) ? -1 : ((x == y) ? 0 : 1);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* You need to implement this to handle the results
|
||||
@ -351,9 +496,11 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
|
||||
/** A row with the (zero based) row number has started */
|
||||
public void startRow(int rowNum);
|
||||
/** A row with the (zero based) row number has ended */
|
||||
public void endRow();
|
||||
/** A cell, with the given formatted value, was encountered */
|
||||
public void cell(String cellReference, String formattedValue);
|
||||
public void endRow(int rowNum);
|
||||
/** A cell, with the given formatted value, and possibly a comment, was encountered */
|
||||
public void cell(String cellReference, String formattedValue, XSSFComment comment);
|
||||
/** A comment for an otherwise-empty cell was encountered */
|
||||
public void emptyCellComment(String cellReference, XSSFComment comment);
|
||||
/** A header or footer has been encountered */
|
||||
public void headerFooter(String text, boolean isHeader, String tagName);
|
||||
}
|
||||
|
@ -39,7 +39,9 @@ import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFReader;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
|
||||
import org.apache.poi.xssf.model.CommentsTable;
|
||||
import org.apache.poi.xssf.model.StylesTable;
|
||||
import org.apache.poi.xssf.usermodel.XSSFComment;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
@ -60,6 +62,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
private Locale locale;
|
||||
private boolean includeTextBoxes = true;
|
||||
private boolean includeSheetNames = true;
|
||||
private boolean includeCellComments = false;
|
||||
private boolean includeHeadersFooters = true;
|
||||
private boolean formulasNotResults = false;
|
||||
|
||||
@ -112,11 +115,10 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
* Would control the inclusion of cell comments from the document,
|
||||
* if we supported it
|
||||
* Should cell comments be included? Default is false
|
||||
*/
|
||||
public void setIncludeCellComments(boolean includeCellComments) {
|
||||
throw new IllegalStateException("Comment extraction not supported in streaming mode, please use XSSFExcelExtractor");
|
||||
this.includeCellComments = includeCellComments;
|
||||
}
|
||||
|
||||
public void setLocale(Locale locale) {
|
||||
@ -159,6 +161,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
public void processSheet(
|
||||
SheetContentsHandler sheetContentsExtractor,
|
||||
StylesTable styles,
|
||||
CommentsTable comments,
|
||||
ReadOnlySharedStringsTable strings,
|
||||
InputStream sheetInputStream)
|
||||
throws IOException, SAXException {
|
||||
@ -176,7 +179,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
SAXParser saxParser = saxFactory.newSAXParser();
|
||||
XMLReader sheetParser = saxParser.getXMLReader();
|
||||
ContentHandler handler = new XSSFSheetXMLHandler(
|
||||
styles, strings, sheetContentsExtractor, formatter, formulasNotResults);
|
||||
styles, comments, strings, sheetContentsExtractor, formatter, formulasNotResults);
|
||||
sheetParser.setContentHandler(handler);
|
||||
sheetParser.parse(sheetSource);
|
||||
} catch(ParserConfigurationException e) {
|
||||
@ -203,7 +206,8 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
text.append(iter.getSheetName());
|
||||
text.append('\n');
|
||||
}
|
||||
processSheet(sheetExtractor, styles, strings, stream);
|
||||
CommentsTable comments = includeCellComments ? iter.getSheetComments() : null;
|
||||
processSheet(sheetExtractor, styles, comments, strings, stream);
|
||||
if (includeHeadersFooters) {
|
||||
sheetExtractor.appendHeaderText(text);
|
||||
}
|
||||
@ -268,17 +272,32 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
firstCellOfRow = true;
|
||||
}
|
||||
|
||||
public void endRow() {
|
||||
public void endRow(int rowNum) {
|
||||
output.append('\n');
|
||||
}
|
||||
|
||||
public void cell(String cellRef, String formattedValue) {
|
||||
public void cell(String cellRef, String formattedValue, XSSFComment comment) {
|
||||
if(firstCellOfRow) {
|
||||
firstCellOfRow = false;
|
||||
} else {
|
||||
output.append('\t');
|
||||
}
|
||||
output.append(formattedValue);
|
||||
if (formattedValue != null) {
|
||||
output.append(formattedValue);
|
||||
}
|
||||
if (includeCellComments && comment != null) {
|
||||
String commentText = comment.getString().getString().replace('\n', ' ');
|
||||
output.append(formattedValue != null ? " Comment by " : "Comment by ");
|
||||
if (commentText.startsWith(comment.getAuthor() + ": ")) {
|
||||
output.append(commentText);
|
||||
} else {
|
||||
output.append(comment.getAuthor()).append(": ").append(commentText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void emptyCellComment(String cellRef, XSSFComment comment) {
|
||||
cell(cellRef, null, comment);
|
||||
}
|
||||
|
||||
public void headerFooter(String text, boolean isHeader, String tagName) {
|
||||
@ -287,7 +306,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Append the text for the named header or footer if found.
|
||||
*/
|
||||
|
@ -20,13 +20,13 @@ package org.apache.poi.xssf.extractor;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.xssf.XSSFTestDataSamples;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for {@link XSSFEventBasedExcelExtractor}
|
||||
*/
|
||||
@ -240,4 +240,68 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
|
||||
fixture.setIncludeHeadersFooters(false);
|
||||
assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that XSSFEventBasedExcelExtractor outputs comments when specified.
|
||||
* The output will contain two improvements over the output from
|
||||
* XSSFExcelExtractor in that (1) comments from empty cells will be
|
||||
* outputted, and (2) the author will not be outputted twice.
|
||||
* <p>
|
||||
* This test will need to be modified if these improvements are ported to
|
||||
* XSSFExcelExtractor.
|
||||
*/
|
||||
public void testCommentsComparedToNonEventBasedExtractor()
|
||||
throws Exception {
|
||||
|
||||
String expectedOutputWithoutComments =
|
||||
"Sheet1\n" +
|
||||
"\n" +
|
||||
"abc\n" +
|
||||
"\n" +
|
||||
"123\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n";
|
||||
|
||||
String nonEventBasedExtractorOutputWithComments =
|
||||
"Sheet1\n" +
|
||||
"\n" +
|
||||
"abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" +
|
||||
"\n" +
|
||||
"123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n";
|
||||
|
||||
String eventBasedExtractorOutputWithComments =
|
||||
"Sheet1\n" +
|
||||
"Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" +
|
||||
"abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" +
|
||||
"Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" +
|
||||
"Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" +
|
||||
"Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" +
|
||||
"Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" +
|
||||
"Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n";
|
||||
|
||||
XSSFExcelExtractor extractor = new XSSFExcelExtractor(
|
||||
XSSFTestDataSamples.openSampleWorkbook("commentTest.xlsx"));
|
||||
try {
|
||||
assertEquals(expectedOutputWithoutComments, extractor.getText());
|
||||
extractor.setIncludeCellComments(true);
|
||||
assertEquals(nonEventBasedExtractorOutputWithComments, extractor.getText());
|
||||
} finally {
|
||||
extractor.close();
|
||||
}
|
||||
|
||||
XSSFEventBasedExcelExtractor fixture =
|
||||
new XSSFEventBasedExcelExtractor(
|
||||
XSSFTestDataSamples.openSamplePackage("commentTest.xlsx"));
|
||||
try {
|
||||
assertEquals(expectedOutputWithoutComments, fixture.getText());
|
||||
fixture.setIncludeCellComments(true);
|
||||
assertEquals(eventBasedExtractorOutputWithComments, fixture.getText());
|
||||
} finally {
|
||||
fixture.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
BIN
test-data/spreadsheet/commentTest.xlsx
Normal file
BIN
test-data/spreadsheet/commentTest.xlsx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user