2007-12-27 07:40:05 -05:00
|
|
|
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
|
2008-03-08 12:21:29 -05:00
|
|
|
package org.apache.poi.xssf.extractor;
|
2007-12-27 07:40:05 -05:00
|
|
|
|
2007-12-30 11:53:42 -05:00
|
|
|
import java.io.File;
|
2007-12-27 07:40:05 -05:00
|
|
|
import java.io.IOException;
|
|
|
|
|
|
|
|
import org.apache.poi.POIXMLTextExtractor;
|
2008-03-08 12:21:29 -05:00
|
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
2007-12-27 07:40:05 -05:00
|
|
|
import org.apache.xmlbeans.XmlException;
|
|
|
|
import org.openxml4j.exceptions.OpenXML4JException;
|
|
|
|
import org.openxml4j.opc.Package;
|
|
|
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
|
|
|
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
|
|
|
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
|
|
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
|
|
|
|
|
2007-12-30 11:53:42 -05:00
|
|
|
/**
|
|
|
|
* Helper class to extract text from an OOXML Excel file
|
|
|
|
*/
|
2008-03-08 12:21:29 -05:00
|
|
|
public class XSSFExcelExtractor extends POIXMLTextExtractor {
|
|
|
|
private XSSFWorkbook workbook;
|
2007-12-27 07:40:05 -05:00
|
|
|
private boolean includeSheetNames = true;
|
|
|
|
private boolean formulasNotResults = false;
|
|
|
|
|
2008-03-08 12:21:29 -05:00
|
|
|
public XSSFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
|
|
|
this(new XSSFWorkbook(container));
|
2007-12-27 07:40:05 -05:00
|
|
|
}
|
2008-03-08 12:21:29 -05:00
|
|
|
public XSSFExcelExtractor(XSSFWorkbook workbook) {
|
2007-12-27 07:40:05 -05:00
|
|
|
super(workbook);
|
|
|
|
this.workbook = workbook;
|
|
|
|
}
|
2007-12-30 11:53:42 -05:00
|
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
|
if(args.length < 1) {
|
|
|
|
System.err.println("Use:");
|
|
|
|
System.err.println(" HXFExcelExtractor <filename.xlsx>");
|
|
|
|
System.exit(1);
|
|
|
|
}
|
|
|
|
POIXMLTextExtractor extractor =
|
|
|
|
new HXFExcelExtractor(HXFDocument.openPackage(
|
|
|
|
new File(args[0])
|
|
|
|
));
|
|
|
|
System.out.println(extractor.getText());
|
|
|
|
}
|
2007-12-27 07:40:05 -05:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Should sheet names be included? Default is true
|
|
|
|
*/
|
|
|
|
public void setIncludeSheetNames(boolean includeSheetNames) {
|
|
|
|
this.includeSheetNames = includeSheetNames;
|
|
|
|
}
|
|
|
|
/**
|
|
|
|
* Should we return the formula itself, and not
|
|
|
|
* the result it produces? Default is false
|
|
|
|
*/
|
|
|
|
public void setFormulasNotResults(boolean formulasNotResults) {
|
|
|
|
this.formulasNotResults = formulasNotResults;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Retreives the text contents of the file
|
|
|
|
*/
|
|
|
|
public String getText() {
|
|
|
|
StringBuffer text = new StringBuffer();
|
|
|
|
|
|
|
|
CTSheet[] sheetRefs =
|
|
|
|
workbook._getHSSFXML().getSheetReferences().getSheetArray();
|
|
|
|
for(int i=0; i<sheetRefs.length; i++) {
|
|
|
|
try {
|
|
|
|
CTWorksheet sheet =
|
|
|
|
workbook._getHSSFXML().getSheet(sheetRefs[i]);
|
|
|
|
CTRow[] rows =
|
|
|
|
sheet.getSheetData().getRowArray();
|
|
|
|
|
|
|
|
if(i > 0) {
|
|
|
|
text.append("\n");
|
|
|
|
}
|
|
|
|
if(includeSheetNames) {
|
|
|
|
text.append(sheetRefs[i].getName() + "\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
for(int j=0; j<rows.length; j++) {
|
|
|
|
CTCell[] cells = rows[j].getCArray();
|
|
|
|
for(int k=0; k<cells.length; k++) {
|
|
|
|
CTCell cell = cells[k];
|
|
|
|
if(k > 0) {
|
|
|
|
text.append("\t");
|
|
|
|
}
|
|
|
|
|
2007-12-27 08:02:17 -05:00
|
|
|
boolean done = false;
|
|
|
|
|
2007-12-27 07:40:05 -05:00
|
|
|
// Is it a formula one?
|
|
|
|
if(cell.getF() != null) {
|
|
|
|
if(formulasNotResults) {
|
|
|
|
text.append(cell.getF().getStringValue());
|
2007-12-27 08:02:17 -05:00
|
|
|
done = true;
|
2007-12-27 07:40:05 -05:00
|
|
|
}
|
2007-12-27 08:02:17 -05:00
|
|
|
}
|
|
|
|
if(!done) {
|
2008-01-08 12:28:39 -05:00
|
|
|
HSSFXMLCell uCell = new HSSFXMLCell(cell, workbook);
|
2007-12-27 08:02:17 -05:00
|
|
|
text.append(uCell.getStringValue());
|
2007-12-27 07:40:05 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
text.append("\n");
|
|
|
|
}
|
|
|
|
} catch(Exception e) {
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return text.toString();
|
|
|
|
}
|
|
|
|
}
|