More merging, plus tests for embedded ooxml files

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@660488 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-05-27 12:36:00 +00:00
parent c132523b46
commit fb7f5240fe
8 changed files with 145 additions and 8 deletions

View File

@ -650,6 +650,7 @@ under the License.
<sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/> <sysproperty key="HSMF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hsmf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/> <sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/> <sysproperty key="POIFS.testdata.path" file="${main.src.test}/org/apache/poi/poifs/data"/>
<sysproperty key="OOXML.testdata.path" file="${ooxml.src.test}/org/apache/poi/ooxml/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain" usefile="no"/> <formatter type="plain" usefile="no"/>
<formatter type="xml"/> <formatter type="xml"/>
@ -799,6 +800,7 @@ under the License.
<sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/> <sysproperty key="HWPF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hwpf/data"/>
<sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/> <sysproperty key="HSLF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hslf/data"/>
<sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/> <sysproperty key="HDGF.testdata.path" file="${scratchpad.src.test}/org/apache/poi/hdgf/data"/>
<sysproperty key="OOXML.testdata.path" file="${ooxml.src.test}/org/apache/poi/ooxml/data"/>
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<formatter type="plain"/> <formatter type="plain"/>
<formatter type="xml"/> <formatter type="xml"/>

View File

@ -46,6 +46,7 @@
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action> <action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release> </release>
<release version="3.1-final" date="2008-06-??"> <release version="3.1-final" date="2008-06-??">
<action dev="POI-DEVELOPERS" type="add">45043 - Support for getting excel cell comments when extracting text</action>
<action dev="POI-DEVELOPERS" type="add">Extend the support for specifying a policy to HSSF on missing / blank cells when fetching, to be able to specify the policy at the HSSFWorkbook level</action> <action dev="POI-DEVELOPERS" type="add">Extend the support for specifying a policy to HSSF on missing / blank cells when fetching, to be able to specify the policy at the HSSFWorkbook level</action>
<action dev="POI-DEVELOPERS" type="fix">45025 - improved FormulaParser parse error messages</action> <action dev="POI-DEVELOPERS" type="fix">45025 - improved FormulaParser parse error messages</action>
<action dev="POI-DEVELOPERS" type="fix">45046 - allowed EXTERNALBOOK(0x01AE) to be optional in the LinkTable</action> <action dev="POI-DEVELOPERS" type="fix">45046 - allowed EXTERNALBOOK(0x01AE) to be optional in the LinkTable</action>

View File

@ -43,6 +43,7 @@
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action> <action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release> </release>
<release version="3.1-final" date="2008-06-??"> <release version="3.1-final" date="2008-06-??">
<action dev="POI-DEVELOPERS" type="add">45043 - Support for getting excel cell comments when extracting text</action>
<action dev="POI-DEVELOPERS" type="add">Extend the support for specifying a policy to HSSF on missing / blank cells when fetching, to be able to specify the policy at the HSSFWorkbook level</action> <action dev="POI-DEVELOPERS" type="add">Extend the support for specifying a policy to HSSF on missing / blank cells when fetching, to be able to specify the policy at the HSSFWorkbook level</action>
<action dev="POI-DEVELOPERS" type="fix">45025 - improved FormulaParser parse error messages</action> <action dev="POI-DEVELOPERS" type="fix">45025 - improved FormulaParser parse error messages</action>
<action dev="POI-DEVELOPERS" type="fix">45046 - allowed EXTERNALBOOK(0x01AE) to be optional in the LinkTable</action> <action dev="POI-DEVELOPERS" type="fix">45046 - allowed EXTERNALBOOK(0x01AE) to be optional in the LinkTable</action>

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFComment;
import org.apache.poi.hssf.usermodel.HSSFRichTextString; import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFSheet;
@ -39,6 +40,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor {
private HSSFWorkbook wb; private HSSFWorkbook wb;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;
private boolean includeCellComments = false;
public ExcelExtractor(HSSFWorkbook wb) { public ExcelExtractor(HSSFWorkbook wb) {
super(wb); super(wb);
@ -62,6 +64,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor {
public void setFormulasNotResults(boolean formulasNotResults) { public void setFormulasNotResults(boolean formulasNotResults) {
this.formulasNotResults = formulasNotResults; this.formulasNotResults = formulasNotResults;
} }
/**
* Should cell comments be included in the extracted text?
* Default is false.
*
* @param includeCellComments true to output each cell's comment
*  (author and text) after the cell's contents, false to omit them
*/
public void setIncludeCellComments(boolean includeCellComments) {
this.includeCellComments = includeCellComments;
}
/** /**
* Retreives the text contents of the file * Retreives the text contents of the file
@ -128,6 +136,15 @@ public class ExcelExtractor extends POIOLE2TextExtractor {
break; break;
} }
// Output the comment, if requested and exists
HSSFComment comment = cell.getCellComment();
if(includeCellComments && comment != null) {
// Replace any newlines with spaces, otherwise it
// breaks the output
String commentText = comment.getString().getString().replace('\n', ' ');
text.append(" Comment by "+comment.getAuthor()+": "+commentText);
}
// Output a tab if we're not on the last cell // Output a tab if we're not on the last cell
if(outputContents && k < (lastCell-1)) { if(outputContents && k < (lastCell-1)) {
text.append("\t"); text.append("\t");

View File

@ -16,25 +16,20 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.xssf.extractor; package org.apache.poi.xssf.extractor;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Comment;
import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package; import org.openxml4j.opc.Package;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
/** /**
* Helper class to extract text from an OOXML Excel file * Helper class to extract text from an OOXML Excel file
@ -43,6 +38,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor {
private Workbook workbook; private Workbook workbook;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;
private boolean includeCellComments = false;
public XSSFExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { public XSSFExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(new XSSFWorkbook(path)); this(new XSSFWorkbook(path));
@ -79,6 +75,12 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor {
public void setFormulasNotResults(boolean formulasNotResults) { public void setFormulasNotResults(boolean formulasNotResults) {
this.formulasNotResults = formulasNotResults; this.formulasNotResults = formulasNotResults;
} }
/**
* Should cell comments be included in the extracted text?
* Default is false.
*
* @param includeCellComments true to output each cell's comment
*  (author and text) after the cell's contents, false to omit them
*/
public void setIncludeCellComments(boolean includeCellComments) {
this.includeCellComments = includeCellComments;
}
/** /**
* Retreives the text contents of the file * Retreives the text contents of the file
@ -94,8 +96,8 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor {
for (Object rawR : sheet) { for (Object rawR : sheet) {
Row row = (Row)rawR; Row row = (Row)rawR;
for(Iterator ri = row.cellIterator(); ri.hasNext();) { for(Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
Cell cell = (Cell)ri.next(); Cell cell = ri.next();
// Is it a formula one? // Is it a formula one?
if(cell.getCellType() == Cell.CELL_TYPE_FORMULA && formulasNotResults) { if(cell.getCellType() == Cell.CELL_TYPE_FORMULA && formulasNotResults) {
@ -107,6 +109,15 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor {
text.append(xc.getRawValue()); text.append(xc.getRawValue());
} }
// Output the comment, if requested and exists
Comment comment = cell.getCellComment();
if(includeCellComments && comment != null) {
// Replace any newlines with spaces, otherwise it
// breaks the output
String commentText = comment.getString().getString().replace('\n', ' ');
text.append(" Comment by "+comment.getAuthor()+": "+commentText);
}
if(ri.hasNext()) if(ri.hasNext())
text.append("\t"); text.append("\t");
} }

View File

@ -0,0 +1,83 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
import java.io.File;
import java.util.Iterator;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.XWPFDocument;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import junit.framework.TestCase;
/**
 * Class to test that we handle embedded bits in
 *  OOXML files properly.
 *
 * Each test opens a sample document from the directory named by
 *  the <code>OOXML.testdata.path</code> system property, and checks
 *  that the expected number of embedded parts is reported and that
 *  every reported part has readable, non-empty contents.
 */
public class TestEmbeded extends TestCase
{
    /** Directory holding the OOXML sample files, read in {@link #setUp()} */
    public String dirname;

    /**
     * Looks up the test data directory, failing early if the
     *  system property hasn't been supplied by the build.
     */
    public void setUp() {
        dirname = System.getProperty("OOXML.testdata.path");
        assertNotNull("System property OOXML.testdata.path is not set", dirname);
    }

    /** Excel sample - currently expected to report no embedded parts */
    public void testExcel() throws Exception {
        File f = new File(dirname, "ExcelWithAttachments.xlsx");
        assertTrue("Missing test file " + f, f.exists());

        POIXMLDocument doc = new XSSFWorkbook(Package.open(f.toString()));
        test(doc, 0);
    }

    /** Word sample - expected to report four embedded parts */
    public void testWord() throws Exception {
        File f = new File(dirname, "WordWithAttachments.docx");
        assertTrue("Missing test file " + f, f.exists());

        POIXMLDocument doc = new XWPFDocument(Package.open(f.toString()));
        test(doc, 4);
    }

    /** PowerPoint sample - currently expected to report no embedded parts */
    public void testPowerPoint() throws Exception {
        File f = new File(dirname, "PPTWithAttachments.pptx");
        assertTrue("Missing test file " + f, f.exists());

        POIXMLDocument doc = new XSLFSlideShow(Package.open(f.toString()));
        test(doc, 0);
    }

    /**
     * Checks that the document reports the expected number of
     *  embedded parts, and that each one yields some bytes.
     *
     * @param doc the opened document to inspect
     * @param expectedCount how many embedded parts should be reported
     * @throws Exception if a part's contents can't be read
     */
    private void test(POIXMLDocument doc, int expectedCount) throws Exception {
        assertNotNull("List of embedded parts was null", doc.getAllEmbedds());
        assertEquals("Wrong number of embedded parts",
                expectedCount, doc.getAllEmbedds().size());

        for(int i=0; i<doc.getAllEmbedds().size(); i++) {
            PackagePart pp = doc.getAllEmbedds().get(i);
            assertNotNull("Embedded part " + i + " was null", pp);

            // Always release the part's stream, even when reading
            //  or the assertion below fails
            java.io.InputStream in = pp.getInputStream();
            assertNotNull("No input stream for embedded part " + i, in);
            try {
                byte[] b = IOUtils.toByteArray(in);
                assertTrue("Embedded part " + i + " was empty", b.length > 0);
            } finally {
                in.close();
            }
        }
    }
}

View File

@ -165,6 +165,28 @@ public final class TestExcelExtractor extends TestCase {
); );
} }
/**
 * Extracts text from a sheet whose cells carry comments, first with
 *  comment output left at its default (off), then with it turned on.
 */
public void testWithComments() throws Exception {
    final String plainText =
        "1.0\tone\n" +
        "2.0\ttwo\n" +
        "3.0\tthree\n";
    final String textWithComments =
        "1.0\tone Comment by Yegor Kozlov: Yegor Kozlov: first cell\n" +
        "2.0\ttwo Comment by Yegor Kozlov: Yegor Kozlov: second cell\n" +
        "3.0\tthree Comment by Yegor Kozlov: Yegor Kozlov: third cell\n";

    ExcelExtractor commentExtractor = createExtractor("SimpleWithComments.xls");
    commentExtractor.setIncludeSheetNames(false);

    // Comments are not requested yet, so only the cell values show up
    String withoutComments = commentExtractor.getText();
    assertEquals(plainText, withoutComments);

    // Once requested, each comment follows the cell it belongs to
    commentExtractor.setIncludeCellComments(true);
    String withComments = commentExtractor.getText();
    assertEquals(textWithComments, withComments);
}
/** /**
* Embded in a non-excel file * Embded in a non-excel file