Make a start on a text extractor for xlsx files

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607058 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-12-27 12:40:05 +00:00
parent ae3bd54533
commit 59f37853cd
5 changed files with 259 additions and 1 deletions

View File

@ -16,6 +16,8 @@
==================================================================== */ ==================================================================== */
package org.apache.poi; package org.apache.poi;
import org.apache.poi.hxf.HXFDocument;
/**
* Parent class of all UserModel POI XML (ooxml)
* implementations.
@ -23,5 +25,9 @@ package org.apache.poi;
* for the XML based classes.
*/
public abstract class POIXMLDocument {
    /** The HXFDocument this document was constructed with. */
    private HXFDocument document;

    /**
     * Creates a new POIXMLDocument around the given HXFDocument.
     *
     * @param document the HXFDocument to keep a reference to
     */
    protected POIXMLDocument(HXFDocument document) {
        this.document = document;
    }
}

View File

@ -0,0 +1,31 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
/**
 * Common parent for text extractors that work from a
 * {@link POIXMLDocument}.
 */
public abstract class POIXMLTextExtractor extends POITextExtractor {
    /** The POIXMLDocument handed to the constructor. */
    protected POIXMLDocument document;

    /**
     * Builds a text extractor around the supplied document.
     *
     * @param document the POIXMLDocument to extract text from
     */
    public POIXMLTextExtractor(POIXMLDocument document) {
        // The parent constructor is given null; the document is
        // tracked through this class's own field instead.
        super(null);
        this.document = document;
    }
}

View File

@ -0,0 +1,113 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.IOException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCellFormula;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
/**
 * Text extractor for xlsx workbooks.
 *
 * Walks each sheet referenced by the workbook and writes its cells out
 * as text: cells in a row are separated by tabs, each row ends with a
 * newline, and sheets after the first are preceded by a blank line.
 */
public class HXFExcelExtractor extends POIXMLTextExtractor {
    private HSSFXMLWorkbook workbook;
    private boolean includeSheetNames = true;
    private boolean formulasNotResults = false;

    /**
     * Creates an extractor for the given package, wrapping it in an
     * HSSFXML and then an HSSFXMLWorkbook.
     *
     * @param container the package to read the workbook from
     */
    public HXFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
        this(new HSSFXMLWorkbook(new HSSFXML(container)));
    }

    /**
     * Creates an extractor for an already opened workbook.
     *
     * @param workbook the workbook to extract text from
     */
    public HXFExcelExtractor(HSSFXMLWorkbook workbook) {
        super(workbook);
        this.workbook = workbook;
    }

    /**
     * Should sheet names be included? Default is true
     */
    public void setIncludeSheetNames(boolean includeSheetNames) {
        this.includeSheetNames = includeSheetNames;
    }

    /**
     * Should we return the formula itself, and not
     * the result it produces? Default is false
     */
    public void setFormulasNotResults(boolean formulasNotResults) {
        this.formulasNotResults = formulasNotResults;
    }

    /**
     * Retrieves the text contents of the file.
     *
     * @return the text of every sheet, in sheet reference order
     * @throws RuntimeException wrapping any exception raised while
     *  loading or walking a sheet
     */
    public String getText() {
        StringBuffer text = new StringBuffer();

        CTSheet[] sheetRefs =
            workbook._getHSSFXML().getSheetReferences().getSheetArray();
        for (int i = 0; i < sheetRefs.length; i++) {
            try {
                CTWorksheet sheet =
                    workbook._getHSSFXML().getSheet(sheetRefs[i]);
                CTRow[] rows = sheet.getSheetData().getRowArray();

                // Blank line between consecutive sheets
                if (i > 0) {
                    text.append("\n");
                }
                if (includeSheetNames) {
                    text.append(sheetRefs[i].getName() + "\n");
                }

                for (int j = 0; j < rows.length; j++) {
                    appendRow(text, rows[j]);
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        return text.toString();
    }

    /** Appends one row: tab-separated cell texts followed by a newline. */
    private void appendRow(StringBuffer text, CTRow row) {
        CTCell[] cells = row.getCArray();
        for (int k = 0; k < cells.length; k++) {
            if (k > 0) {
                text.append("\t");
            }
            text.append(cellText(cells[k]));
        }
        text.append("\n");
    }

    /**
     * Returns the text for one cell: the formula string when the cell
     * has a formula and formulas were requested, otherwise the cell's
     * v value. A missing value becomes the empty string, so the
     * literal "null" is never written into the output.
     */
    private String cellText(CTCell cell) {
        String value;
        if (formulasNotResults && cell.getF() != null) {
            value = cell.getF().getStringValue();
        } else {
            value = cell.getV();
        }
        return (value == null) ? "" : value;
    }
}

View File

@ -0,0 +1,33 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.usermodel;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.HSSFXML;
/**
 * A {@link POIXMLDocument} built on top of an {@link HSSFXML}.
 */
public class HSSFXMLWorkbook extends POIXMLDocument {
    /** The HSSFXML supplied at construction time. */
    private HSSFXML hssfXML;

    /**
     * Wraps the given HSSFXML, passing it on to the parent
     * constructor as well.
     *
     * @param xml the HSSFXML backing this workbook
     */
    public HSSFXMLWorkbook(HSSFXML xml) {
        super(xml);
        hssfXML = xml;
    }

    /**
     * Gives access to the HSSFXML this workbook was created with.
     *
     * @return the backing HSSFXML
     */
    public HSSFXML _getHSSFXML() {
        return hssfXML;
    }
}

View File

@ -0,0 +1,75 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.File;
import org.apache.poi.hssf.HSSFXML;
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
import org.apache.poi.hxf.HXFDocument;
import junit.framework.TestCase;
/**
* Tests for HXFExcelExtractor
*/
public class TestHXFExcelExtractor extends TestCase {
    /** Opened from sample.xlsx - a very simple file. */
    private HSSFXML xmlA;
    /** Opened from AverageTaxRates.xlsx - a fairly complex file. */
    private HSSFXML xmlB;

    /**
     * Opens both sample workbooks from the directory named by the
     * HSSF.testdata.path system property.
     */
    protected void setUp() throws Exception {
        super.setUp();

        xmlA = openTestFile("sample.xlsx");
        xmlB = openTestFile("AverageTaxRates.xlsx");
    }

    /**
     * Opens the named file from the test data directory as an HSSFXML.
     */
    private static HSSFXML openTestFile(String name) throws Exception {
        String dir = System.getProperty("HSSF.testdata.path");
        File file = new File(dir + File.separator + name);
        return new HSSFXML(HXFDocument.openPackage(file));
    }

    /**
     * Get text out of the simple file
     */
    public void testGetSimpleText() throws Exception {
        // Both constructor forms must be usable
        new HXFExcelExtractor(xmlA.getPackage());
        new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA));

        HXFExcelExtractor extractor =
            new HXFExcelExtractor(xmlA.getPackage());

        // First result is discarded; the second call supplies the text
        extractor.getText();
        String text = extractor.getText();
        assertTrue(text.length() > 0);
        System.err.println(text);

        // The output opens with the first sheet's name and closes with
        // the last sheet's name on a line of its own
        assertTrue(text.startsWith("Sheet1"));
        assertTrue(text.endsWith("Sheet3\n"));
    }
}