Bugzilla 51961: support compression of temp files in SXSSF

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1212330 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2011-12-09 11:04:22 +00:00
parent 1b19410d68
commit c46ea351f6
11 changed files with 483 additions and 269 deletions

View File

@ -739,6 +739,16 @@ import org.apache.poi.xssf.streaming.SXSSFWorkbook;
]]></source>
<p>SXSSF flushes sheet data to temporary files (one temp file per sheet), and these temporary files
can grow very large. For example, for 20 MB of CSV data the temp XML file can exceed a gigabyte.
If the size of the temp files is an issue, you can tell SXSSF to use gzip compression:
</p>
<source><![CDATA[
SXSSFWorkbook wb = new SXSSFWorkbook();
wb.setCompressTempFiles(true); // temp files will be gzipped
]]></source>
</section>
<anchor id="low_level_api" />

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta5" date="2011-??-??">
<action dev="poi-developers" type="add">51961 - support compression of temp files in SXSSF </action>
<action dev="poi-developers" type="add">52268 - support cloning sheets with drawings in XSSF </action>
<action dev="poi-developers" type="add">52285 - Support XWPF smart tags text in Paragraphs</action>
<action dev="poi-developers" type="fix">51875 - More XSSF new-line in formula support</action>

View File

@ -0,0 +1,60 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
package org.apache.poi.xssf.streaming;
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
* Sheet writer that supports gzip compression of the temp files.
*/
public class GZIPSheetDataWriter extends SheetDataWriter {
public GZIPSheetDataWriter() throws IOException {
super();
}
/**
* @return temp file to write sheet data
*/
public File createTempFile()throws IOException {
File fd = File.createTempFile("poi-sxssf-sheet-xml", ".gz");
fd.deleteOnExit();
return fd;
}
/**
* @return a wrapped instance of GZIPOutputStream
*/
public Writer createWriter(File fd)throws IOException {
return new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(fd)));
}
/**
* @return a GZIPInputStream stream to read the compressed temp file
*/
public InputStream getWorksheetXMLInputStream() throws IOException {
File fd = getTempFile();
return new GZIPInputStream(new FileInputStream(fd));
}
}

View File

@ -23,9 +23,9 @@ import java.util.TreeMap;
import java.util.Map;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.ss.util.SheetUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.hssf.util.PaneInformation;
@ -48,14 +48,24 @@ public class SXSSFSheet implements Sheet, Cloneable
{
_workbook=workbook;
_sh=xSheet;
_writer=new SheetDataWriter();
_writer = workbook.createSheetDataWriter();
setRandomAccessWindowSize(_workbook.getRandomAccessWindowSize());
}
/**
* Returns the writer stored in the <code>_writer</code> field of this sheet.
* For testing purposes only.
*
* @return the sheet data writer of this sheet
*/
SheetDataWriter getSheetDataWriter(){
return _writer;
}
/**
* Gets the "&lt;sheetData&gt;" document fragment.
* <p>
* The writer is closed before the stream is opened, so this call ends the
* writing phase of the sheet.
* </p>
*
* @return the stream supplied by the sheet data writer
* @throws IOException if flushing, closing the writer or opening the stream fails
*/
public InputStream getWorksheetXMLInputStream() throws IOException
{
// flush all remaining data and close the temp file writer
flushRows(0);
// the writer must be closed before its temp file is read back
_writer.close();
return _writer.getWorksheetXMLInputStream();
}
@ -1271,260 +1281,4 @@ public class SXSSFSheet implements Sheet, Cloneable
assert false;
return -1;
}
/*
 * Initially copied from BigGridDemo "SpreadsheetWriter". Unlike the original code, which wrote
 * the entire document, this class only writes the "sheetData" document fragment, which is why
 * it was renamed to "SheetDataWriter".
 */
public class SheetDataWriter
{
    private final File _fd;
    private final Writer _out;
    private int _rownum;
    private boolean _rowContainedNullCells=false;
    int _numberOfFlushedRows;
    int _lowestIndexOfFlushedRows; // meaningful only if _numberOfFlushedRows>0
    int _numberOfCellsOfLastFlushedRow; // meaningful only if _numberOfFlushedRows>0

    /**
     * Creates the temp file ("poi-sxssf-sheet" prefix, ".xml" suffix) and opens a
     * buffered writer on it.
     */
    public SheetDataWriter() throws IOException
    {
        _fd = File.createTempFile("poi-sxssf-sheet", ".xml");
        _fd.deleteOnExit();
        _out = new BufferedWriter(new FileWriter(_fd));
    }

    public int getNumberOfFlushedRows()
    {
        return _numberOfFlushedRows;
    }

    public int getNumberOfCellsOfLastFlushedRow()
    {
        return _numberOfCellsOfLastFlushedRow;
    }

    public int getLowestIndexOfFlushedRows()
    {
        return _lowestIndexOfFlushedRows;
    }

    /**
     * Releases the writer and removes the temp file.
     * The writer is closed first so that the file handle is not left open
     * when the file is deleted.
     */
    protected void finalize() throws Throwable
    {
        try
        {
            // _out is null if the constructor failed before opening the writer
            if(_out != null) _out.close();
        }
        finally
        {
            if(_fd != null) _fd.delete();
        }
    }

    /**
     * Flushes and closes the writer, then opens the temp file for reading.
     *
     * @return a stream to read the temp file with the sheet data
     */
    public InputStream getWorksheetXMLInputStream() throws IOException
    {
        _out.flush();
        _out.close();
        return new FileInputStream(_fd);
    }

    /**
     * Write a row to the file
     *
     * @param rownum 0-based row number
     * @param row a row
     */
    public void writeRow(int rownum,SXSSFRow row) throws IOException
    {
        if(_numberOfFlushedRows==0)
            _lowestIndexOfFlushedRows=rownum;
        _numberOfCellsOfLastFlushedRow=row.getLastCellNum();
        _numberOfFlushedRows++;
        beginRow(rownum,row);
        Iterator<Cell> cells=row.allCellsIterator();
        int columnIndex=0;
        while(cells.hasNext())
        {
            writeCell(columnIndex++,cells.next());
        }
        endRow();
    }

    /** Writes the opening "row" tag with its attributes and remembers the row number. */
    void beginRow(int rownum,SXSSFRow row) throws IOException
    {
        // the "r" attribute is 1-based while rownum is 0-based
        _out.write("<row r=\""+(rownum+1)+"\"");
        if(row.hasCustomHeight())
            _out.write(" customHeight=\"true\" ht=\""+row.getHeightInPoints()+"\"");
        if(row.getZeroHeight())
            _out.write(" hidden=\"true\"");
        if(row.isFormatted()) {
            _out.write(" s=\"" + row._style + "\"");
            _out.write(" customFormat=\"1\"");
        }
        _out.write(">\n");
        this._rownum = rownum;
        _rowContainedNullCells=false;
    }

    /** Writes the closing "row" tag. */
    void endRow() throws IOException
    {
        _out.write("</row>\n");
    }

    /**
     * Writes one cell of the current row. A <code>null</code> cell writes nothing.
     *
     * @param columnIndex 0-based column index used to build the cell reference
     * @param cell the cell to write, may be <code>null</code>
     */
    public void writeCell(int columnIndex,Cell cell) throws IOException
    {
        if(cell==null)
        {
            _rowContainedNullCells=true;
            return;
        }
        String ref = new CellReference(_rownum, columnIndex).formatAsString();
        _out.write("<c r=\""+ref+"\"");
        CellStyle cellStyle=cell.getCellStyle();
        // style index 0 is not written
        if(cellStyle.getIndex() != 0) _out.write(" s=\""+cellStyle.getIndex()+"\"");
        int cellType=cell.getCellType();
        switch(cellType)
        {
            case Cell.CELL_TYPE_BLANK:
            {
                _out.write(">");
                break;
            }
            case Cell.CELL_TYPE_FORMULA:
            {
                _out.write(">");
                _out.write("<f>");
                outputQuotedString(cell.getCellFormula());
                _out.write("</f>");
                // only numeric cached results are written
                if(cell.getCachedFormulaResultType()==Cell.CELL_TYPE_NUMERIC)
                {
                    double nval = cell.getNumericCellValue();
                    if(!Double.isNaN(nval)){
                        _out.write("<v>"+nval+"</v>");
                    }
                }
                break;
            }
            case Cell.CELL_TYPE_STRING:
            {
                _out.write(" t=\"inlineStr\">");
                _out.write("<is><t>");
                outputQuotedString(cell.getStringCellValue());
                _out.write("</t></is>");
                break;
            }
            case Cell.CELL_TYPE_NUMERIC:
            {
                _out.write(" t=\"n\">");
                _out.write("<v>"+cell.getNumericCellValue()+"</v>");
                break;
            }
            case Cell.CELL_TYPE_BOOLEAN:
            {
                _out.write(" t=\"b\">");
                _out.write("<v>"+(cell.getBooleanCellValue()?"1":"0")+"</v>");
                break;
            }
            case Cell.CELL_TYPE_ERROR:
            {
                FormulaError error = FormulaError.forInt(cell.getErrorCellValue());
                _out.write(" t=\"e\">");
                _out.write("<v>" + error.getString() +"</v>");
                break;
            }
            default:
            {
                assert false;
                throw new RuntimeException("Huh?");
            }
        }
        _out.write("</c>");
    }

    /**
     * Writes a string with XML escaping applied.
     * <p>
     * Markup characters, tab, CR, LF and the non-breaking space are written as
     * entities; other characters outside printable ASCII are written as decimal
     * character references. A surrogate pair is written as a single reference to
     * its code point. Characters that XML 1.0 does not allow (other control
     * characters, unpaired surrogates, 0xFFFE and 0xFFFF) are replaced with '?',
     * because a character reference to them would make the document malformed.
     * </p>
     * Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java
     *
     * @param s the text to write; <code>null</code> or empty writes nothing
     */
    protected void outputQuotedString(String s) throws IOException
    {
        if(s == null || s.length() == 0) {
            return;
        }
        char[] chars=s.toCharArray();
        int length=chars.length;
        int last=0; // start of the pending run of characters that need no escaping
        for(int counter = 0; counter < length; counter++)
        {
            char c = chars[counter];
            String replacement;
            int consumed=1; // number of chars covered by the replacement
            switch(c)
            {
                case '<': replacement="&lt;"; break;
                case '>': replacement="&gt;"; break;
                case '&': replacement="&amp;"; break;
                case '"': replacement="&quot;"; break;
                // Special characters
                case '\n': replacement="&#xa;"; break;
                case '\t': replacement="&#x9;"; break;
                case '\r': replacement="&#xd;"; break;
                case 0xa0: replacement="&#xa0;"; break;
                default:
                    if(c>=' '&&c<=127)
                    {
                        continue; // stays in the pending run and is copied verbatim
                    }
                    if(Character.isHighSurrogate(c) && counter+1<length
                            && Character.isLowSurrogate(chars[counter+1]))
                    {
                        // a character reference must name the code point, not its two halves
                        replacement="&#"+Character.toCodePoint(c,chars[counter+1])+";";
                        consumed=2;
                    }
                    else if(isLegalXmlChar(c))
                    {
                        // If the character is outside of ascii, write the numeric value.
                        replacement="&#"+(int)c+";";
                    }
                    else
                    {
                        replacement="?";
                    }
                    break;
            }
            if(counter>last)
            {
                _out.write(chars,last,counter-last);
            }
            _out.write(replacement);
            counter+=consumed-1;
            last=counter+1;
        }
        if(last<length)
        {
            _out.write(chars,last,length-last);
        }
    }

    /** Tells whether a single UTF-16 unit is a character permitted by XML 1.0. */
    private boolean isLegalXmlChar(char c)
    {
        return c==0x9 || c==0xA || c==0xD
                || (c>=0x20 && c<=0xD7FF)
                || (c>=0xE000 && c<=0xFFFD);
    }
}
}

View File

@ -66,6 +66,11 @@ public class SXSSFWorkbook implements Workbook
private int _randomAccessWindowSize = DEFAULT_WINDOW_SIZE;
/**
* whether temp files should be compressed.
*/
private boolean _compressTmpFiles = false;
/**
* Construct a new workbook
*/
@ -151,6 +156,31 @@ public class SXSSFWorkbook implements Workbook
_randomAccessWindowSize = rowAccessWindowSize;
}
/**
* Set whether temp files should be compressed.
* <p>
* SXSSF writes sheet data in temporary files (a temp file per sheet)
* and the size of these temp files can grow to a very large size,
* e.g. for 20 MB of csv data the temp xml file can become a few GB large.
* If the "compress" flag is set to <code>true</code> then the temporary XML is gzipped.
* </p>
* <p>
* The flag is read when the data writer of a sheet is created,
* so set it before creating the sheets it should apply to.
* </p>
* <p>
* Please note that the "compress" option may cause a performance penalty.
* </p>
* @param compress whether to compress temp files
*/
public void setCompressTempFiles(boolean compress){
_compressTmpFiles = compress;
}
/**
 * Creates the writer for the data of a new sheet, depending on the
 * "compress temp files" flag.
 *
 * @return a {@link GZIPSheetDataWriter} if compression is enabled,
 *         a plain {@link SheetDataWriter} otherwise
 * @throws IOException if the writer cannot be created
 */
SheetDataWriter createSheetDataWriter() throws IOException {
    return _compressTmpFiles ? new GZIPSheetDataWriter() : new SheetDataWriter();
}
XSSFSheet getXSSFSheet(SXSSFSheet sheet)
{
XSSFSheet result=_sxFromXHash.get(sheet);

View File

@ -0,0 +1,300 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.FormulaError;
import org.apache.poi.ss.util.CellReference;
import java.io.*;
import java.util.Iterator;
/**
 * Writes the rows of a sheet to a temporary file as XML.
 * <p>
 * Initially copied from BigGridDemo "SpreadsheetWriter".
 * Unlike the original code, which wrote the entire document,
 * this class only writes the "sheetData" document fragment,
 * which is why it was renamed to "SheetDataWriter".
 * </p>
 * <p>
 * The constructor calls the overridable {@link #createTempFile()} and
 * {@link #createWriter(File)}, so overrides of these two methods run before
 * the subclass constructor body and must not depend on subclass fields.
 * </p>
 */
public class SheetDataWriter {
    private final File _fd;
    private final Writer _out;
    private int _rownum;
    private boolean _rowContainedNullCells = false;
    int _numberOfFlushedRows;
    int _lowestIndexOfFlushedRows; // meaningful only if _numberOfFlushedRows>0
    int _numberOfCellsOfLastFlushedRow; // meaningful only if _numberOfFlushedRows>0

    /**
     * Creates the temp file and opens the writer on it.
     *
     * @throws IOException if the temp file or the writer cannot be created
     */
    public SheetDataWriter() throws IOException {
        _fd = createTempFile();
        _out = createWriter(_fd);
    }

    /**
     * Create a temp file to write sheet data.
     * By default, temp files are created in the default temporary-file directory
     * with a prefix "poi-sxssf-sheet" and suffix ".xml". Subclasses can override
     * it and specify a different temp directory or filename or suffix, e.g. <code>.gz</code>
     *
     * @return temp file to write sheet data
     * @throws IOException if the temp file cannot be created
     */
    public File createTempFile() throws IOException {
        File fd = File.createTempFile("poi-sxssf-sheet", ".xml");
        fd.deleteOnExit();
        return fd;
    }

    /**
     * Create a writer for the sheet data.
     * <p>
     * The charset is fixed to UTF-8 instead of the platform default. Everything
     * this class writes is plain ASCII (other characters are escaped by
     * {@link #outputQuotedString(String)}), so the bytes are the same on every
     * platform.
     * </p>
     *
     * @param fd the file to write to
     * @return a buffered writer on the given file
     * @throws IOException if the file cannot be opened for writing
     */
    public Writer createWriter(File fd) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fd), "UTF-8"));
    }

    /**
     * flush and close the temp data writer.
     * This method <em>must</em> be invoked before calling {@link #getWorksheetXMLInputStream()}
     *
     * @throws IOException if the pending data cannot be written
     */
    public void close() throws IOException {
        _out.flush();
        _out.close();
    }

    /**
     * @return the temp file the sheet data is written to
     */
    File getTempFile() {
        return _fd;
    }

    /**
     * @return a stream to read temp file with the sheet data
     * @throws IOException if the temp file cannot be opened
     */
    public InputStream getWorksheetXMLInputStream() throws IOException {
        File fd = getTempFile();
        return new FileInputStream(fd);
    }

    public int getNumberOfFlushedRows() {
        return _numberOfFlushedRows;
    }

    public int getNumberOfCellsOfLastFlushedRow() {
        return _numberOfCellsOfLastFlushedRow;
    }

    public int getLowestIndexOfFlushedRows() {
        return _lowestIndexOfFlushedRows;
    }

    /**
     * Releases the writer and removes the temp file.
     * The writer is closed first so that the file handle is not left open
     * when the file is deleted.
     */
    protected void finalize() throws Throwable {
        try {
            // _out is null if the constructor failed before opening the writer
            if (_out != null) {
                _out.close();
            }
        } finally {
            if (_fd != null) {
                _fd.delete();
            }
        }
    }

    /**
     * Write a row to the file
     *
     * @param rownum 0-based row number
     * @param row    a row
     * @throws IOException if writing to the temp file fails
     */
    public void writeRow(int rownum, SXSSFRow row) throws IOException {
        if (_numberOfFlushedRows == 0) {
            _lowestIndexOfFlushedRows = rownum;
        }
        _numberOfCellsOfLastFlushedRow = row.getLastCellNum();
        _numberOfFlushedRows++;
        beginRow(rownum, row);
        Iterator<Cell> cells = row.allCellsIterator();
        int columnIndex = 0;
        while (cells.hasNext()) {
            writeCell(columnIndex++, cells.next());
        }
        endRow();
    }

    /** Writes the opening "row" tag with its attributes and remembers the row number. */
    void beginRow(int rownum, SXSSFRow row) throws IOException {
        // the "r" attribute is 1-based while rownum is 0-based
        _out.write("<row r=\"" + (rownum + 1) + "\"");
        if (row.hasCustomHeight()) {
            _out.write(" customHeight=\"true\" ht=\"" + row.getHeightInPoints() + "\"");
        }
        if (row.getZeroHeight()) {
            _out.write(" hidden=\"true\"");
        }
        if (row.isFormatted()) {
            _out.write(" s=\"" + row._style + "\"");
            _out.write(" customFormat=\"1\"");
        }
        _out.write(">\n");
        this._rownum = rownum;
        _rowContainedNullCells = false;
    }

    /** Writes the closing "row" tag. */
    void endRow() throws IOException {
        _out.write("</row>\n");
    }

    /**
     * Writes one cell of the current row. A <code>null</code> cell writes nothing.
     *
     * @param columnIndex 0-based column index used to build the cell reference
     * @param cell        the cell to write, may be <code>null</code>
     * @throws IOException if writing to the temp file fails
     */
    public void writeCell(int columnIndex, Cell cell) throws IOException {
        if (cell == null) {
            _rowContainedNullCells = true;
            return;
        }
        String ref = new CellReference(_rownum, columnIndex).formatAsString();
        _out.write("<c r=\"" + ref + "\"");
        CellStyle cellStyle = cell.getCellStyle();
        // style index 0 is not written
        if (cellStyle.getIndex() != 0) {
            _out.write(" s=\"" + cellStyle.getIndex() + "\"");
        }
        int cellType = cell.getCellType();
        switch (cellType) {
            case Cell.CELL_TYPE_BLANK: {
                _out.write(">");
                break;
            }
            case Cell.CELL_TYPE_FORMULA: {
                _out.write(">");
                _out.write("<f>");
                outputQuotedString(cell.getCellFormula());
                _out.write("</f>");
                // only numeric cached results are written
                if (cell.getCachedFormulaResultType() == Cell.CELL_TYPE_NUMERIC) {
                    double nval = cell.getNumericCellValue();
                    if (!Double.isNaN(nval)) {
                        _out.write("<v>" + nval + "</v>");
                    }
                }
                break;
            }
            case Cell.CELL_TYPE_STRING: {
                _out.write(" t=\"inlineStr\">");
                _out.write("<is><t>");
                outputQuotedString(cell.getStringCellValue());
                _out.write("</t></is>");
                break;
            }
            case Cell.CELL_TYPE_NUMERIC: {
                _out.write(" t=\"n\">");
                _out.write("<v>" + cell.getNumericCellValue() + "</v>");
                break;
            }
            case Cell.CELL_TYPE_BOOLEAN: {
                _out.write(" t=\"b\">");
                _out.write("<v>" + (cell.getBooleanCellValue() ? "1" : "0") + "</v>");
                break;
            }
            case Cell.CELL_TYPE_ERROR: {
                FormulaError error = FormulaError.forInt(cell.getErrorCellValue());
                _out.write(" t=\"e\">");
                _out.write("<v>" + error.getString() + "</v>");
                break;
            }
            default: {
                assert false;
                throw new RuntimeException("Huh?");
            }
        }
        _out.write("</c>");
    }

    /**
     * Writes a string with XML escaping applied.
     * <p>
     * Markup characters, tab, CR, LF and the non-breaking space are written as
     * entities; other characters outside printable ASCII are written as decimal
     * character references. A surrogate pair is written as a single reference to
     * its code point. Characters that XML 1.0 does not allow (other control
     * characters, unpaired surrogates, 0xFFFE and 0xFFFF) are replaced with '?',
     * because a character reference to them would make the document malformed.
     * </p>
     * Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java
     *
     * @param s the text to write; <code>null</code> or empty writes nothing
     * @throws IOException if writing to the temp file fails
     */
    protected void outputQuotedString(String s) throws IOException {
        if (s == null || s.length() == 0) {
            return;
        }
        char[] chars = s.toCharArray();
        int length = chars.length;
        int last = 0; // start of the pending run of characters that need no escaping
        for (int counter = 0; counter < length; counter++) {
            char c = chars[counter];
            String replacement;
            int consumed = 1; // number of chars covered by the replacement
            switch (c) {
                case '<':
                    replacement = "&lt;";
                    break;
                case '>':
                    replacement = "&gt;";
                    break;
                case '&':
                    replacement = "&amp;";
                    break;
                case '"':
                    replacement = "&quot;";
                    break;
                // Special characters
                case '\n':
                    replacement = "&#xa;";
                    break;
                case '\t':
                    replacement = "&#x9;";
                    break;
                case '\r':
                    replacement = "&#xd;";
                    break;
                case 0xa0:
                    replacement = "&#xa0;";
                    break;
                default:
                    if (c >= ' ' && c <= 127) {
                        continue; // stays in the pending run and is copied verbatim
                    }
                    if (Character.isHighSurrogate(c) && counter + 1 < length
                            && Character.isLowSurrogate(chars[counter + 1])) {
                        // a character reference must name the code point, not its two halves
                        replacement = "&#" + Character.toCodePoint(c, chars[counter + 1]) + ";";
                        consumed = 2;
                    } else if (isLegalXmlChar(c)) {
                        // If the character is outside of ascii, write the numeric value.
                        replacement = "&#" + (int) c + ";";
                    } else {
                        replacement = "?";
                    }
                    break;
            }
            if (counter > last) {
                _out.write(chars, last, counter - last);
            }
            _out.write(replacement);
            counter += consumed - 1;
            last = counter + 1;
        }
        if (last < length) {
            _out.write(chars, last, length - last);
        }
    }

    /** Tells whether a single UTF-16 unit is a character permitted by XML 1.0. */
    private static boolean isLegalXmlChar(char c) {
        return c == 0x9 || c == 0xA || c == 0xD
                || (c >= 0x20 && c <= 0xD7FF)
                || (c >= 0xE000 && c <= 0xFFFD);
    }
}

View File

@ -17,7 +17,7 @@
* ====================================================================
*/
package org.apache.poi.xssf.usermodel.streaming;
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.SXSSFITestDataProvider;

View File

@ -17,9 +17,8 @@
* ====================================================================
*/
package org.apache.poi.xssf.usermodel.streaming;
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.BaseTestCell;
import org.apache.poi.ss.usermodel.BaseTestHyperlink;
import org.apache.poi.xssf.SXSSFITestDataProvider;

View File

@ -17,7 +17,7 @@
* ====================================================================
*/
package org.apache.poi.xssf.usermodel.streaming;
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.BaseTestRow;
import org.apache.poi.xssf.SXSSFITestDataProvider;

View File

@ -17,7 +17,7 @@
* ====================================================================
*/
package org.apache.poi.xssf.usermodel.streaming;
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.BaseTestSheet;
import org.apache.poi.xssf.SXSSFITestDataProvider;

View File

@ -17,16 +17,15 @@
* ====================================================================
*/
package org.apache.poi.xssf.usermodel.streaming;
package org.apache.poi.xssf.streaming;
import org.apache.poi.ss.usermodel.BaseTestWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.SXSSFITestDataProvider;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
public final class TestSXSSFWorkbook extends BaseTestWorkbook {
public TestSXSSFWorkbook() {
@ -137,4 +136,65 @@ public final class TestSXSSFWorkbook extends BaseTestWorkbook {
assertNotNull(cell3_1_1);
assertEquals("value 3_1_1", cell3_1_1.getStringCellValue());
}
/**
 * Checks that the workbook hands out the sheet data writer matching the
 * "compress temp files" flag and that each writer names its temp file as expected.
 */
public void testSheetdataWriter(){
    // default: plain XML temp file
    SXSSFWorkbook plainWb = new SXSSFWorkbook();
    SXSSFSheet plainSheet = (SXSSFSheet)plainWb.createSheet();
    SheetDataWriter plainWriter = plainSheet.getSheetDataWriter();
    // assertEquals reports both classes on failure, unlike assertTrue(a == b)
    assertEquals(SheetDataWriter.class, plainWriter.getClass());
    String plainName = plainWriter.getTempFile().getName();
    assertTrue(plainName, plainName.startsWith("poi-sxssf-sheet"));
    assertTrue(plainName, plainName.endsWith(".xml"));

    // compression enabled before the sheet is created: gzipped temp file
    SXSSFWorkbook gzipWb = new SXSSFWorkbook();
    gzipWb.setCompressTempFiles(true);
    SXSSFSheet gzipSheet = (SXSSFSheet)gzipWb.createSheet();
    SheetDataWriter gzipWriter = gzipSheet.getSheetDataWriter();
    assertEquals(GZIPSheetDataWriter.class, gzipWriter.getClass());
    String gzipName = gzipWriter.getTempFile().getName();
    assertTrue(gzipName, gzipName.startsWith("poi-sxssf-sheet-xml"));
    assertTrue(gzipName, gzipName.endsWith(".gz"));
}
/**
 * Writes several sheets with compressed temp files enabled, reads the
 * workbook back and checks every cell that was written.
 */
public void testGZipSheetdataWriter(){
    final int sheetCount = 5;
    final int rowCount = 10000;

    SXSSFWorkbook streamingWb = new SXSSFWorkbook();
    streamingWb.setCompressTempFiles(true);
    Workbook wb = streamingWb;

    // column 0: the cell's own reference, column 1: sheet index, column 2: row index
    for(int sheetIdx = 0; sheetIdx < sheetCount; sheetIdx++){
        Sheet sheet = wb.createSheet("sheet" + sheetIdx);
        for(int rowIdx = 0; rowIdx < rowCount; rowIdx++){
            Row row = sheet.createRow(rowIdx);
            Cell refCell = row.createCell(0);
            refCell.setCellValue(new CellReference(refCell).formatAsString());
            Cell sheetIdxCell = row.createCell(1);
            sheetIdxCell.setCellValue(sheetIdx);
            Cell rowIdxCell = row.createCell(2);
            rowIdxCell.setCellValue(rowIdx);
        }
    }

    Workbook readBack = SXSSFITestDataProvider.instance.writeOutAndReadBack(wb);

    for(int sheetIdx = 0; sheetIdx < sheetCount; sheetIdx++){
        Sheet sheet = readBack.getSheetAt(sheetIdx);
        assertEquals("sheet" + sheetIdx, sheet.getSheetName());
        for(int rowIdx = 0; rowIdx < rowCount; rowIdx++){
            Row row = sheet.getRow(rowIdx);
            assertNotNull("row[" + rowIdx + "]", row);
            Cell refCell = row.getCell(0);
            assertEquals(new CellReference(refCell).formatAsString(), refCell.getStringCellValue());
            Cell sheetIdxCell = row.getCell(1);
            assertEquals(sheetIdx, (int)sheetIdxCell.getNumericCellValue());
            Cell rowIdxCell = row.getCell(2);
            assertEquals(rowIdx, (int)rowIdxCell.getNumericCellValue());
        }
    }
}
}