code improvements to RecordFactoryInputStream

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@801850 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Josh Micich 2009-08-07 00:21:00 +00:00
parent acc07e2d6b
commit 755b86af67
3 changed files with 252 additions and 268 deletions

View File

@ -31,7 +31,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* processWorkbookEvents along with a request.
*
* This will cause your file to be processed a record at a time. Each record with
* a static id matching one that you have registed in your HSSFRequest will be passed
* a static id matching one that you have registered in your HSSFRequest will be passed
* to your associated HSSFListener.
*
* @see org.apache.poi.hssf.dev.EFHSSF
@ -39,115 +39,98 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* @author Andrew C. Oliver (acoliver at apache dot org)
* @author Carey Sublette (careysub@earthling.net)
*/
public class HSSFEventFactory {
/** Creates a new instance of HSSFEventFactory */
public HSSFEventFactory() {
// no instance fields
}
public class HSSFEventFactory
{
/** Creates a new instance of HSSFEventFactory */
public HSSFEventFactory()
{
}
/**
* Processes a file into essentially record events.
*
* @param req an Instance of HSSFRequest which has your registered listeners
* @param fs a POIFS filesystem containing your workbook
*/
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
throws IOException
{
InputStream in = fs.createDocumentInputStream("Workbook");
processEvents(req, in);
}
/**
/**
* Processes a file into essentially record events.
*
* @param req an Instance of HSSFRequest which has your registered listeners
* @param fs a POIFS filesystem containing your workbook
* @return numeric user-specified result code.
* @param req an Instance of HSSFRequest which has your registered listeners
* @param fs a POIFS filesystem containing your workbook
*/
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
    // The workbook records are read from the POIFS entry named "Workbook".
    InputStream in = fs.createDocumentInputStream("Workbook");
    try {
        // Feed every record to the listeners registered on the request.
        processEvents(req, in);
    } finally {
        // This method opened the stream, so it is responsible for releasing it,
        // even when record processing fails part-way through.
        in.close();
    }
}
/**
* Processes a file into essentially record events.
*
* @param req an Instance of HSSFRequest which has your registered listeners
* @param fs a POIFS filesystem containing your workbook
* @return numeric user-specified result code.
*/
public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
throws IOException, HSSFUserException
{
throws IOException, HSSFUserException {
InputStream in = fs.createDocumentInputStream("Workbook");
return abortableProcessEvents(req, in);
}
}
/**
* Processes a DocumentInputStream into essentially Record events.
*
* If an <code>AbortableHSSFListener</code> causes a halt to processing during this call
* the method will return just as with <code>abortableProcessEvents</code>, but no
* user code or <code>HSSFUserException</code> will be passed back.
*
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
*/
public void processEvents(HSSFRequest req, InputStream in)
throws IOException
{
try
{
/**
* Processes a DocumentInputStream into essentially Record events.
*
* If an <code>AbortableHSSFListener</code> causes a halt to processing during this call
* the method will return just as with <code>abortableProcessEvents</code>, but no
* user code or <code>HSSFUserException</code> will be passed back.
*
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
*/
public void processEvents(HSSFRequest req, InputStream in) {
try {
genericProcessEvents(req, new RecordInputStream(in));
} catch (HSSFUserException hue) {
/*If an HSSFUserException user exception is thrown, ignore it.*/
}
catch (HSSFUserException hue)
{/*If an HSSFUserException user exception is thrown, ignore it.*/ }
}
/**
* Processes a DocumentInputStream into essentially Record events.
*
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
* @return numeric user-specified result code.
*/
public short abortableProcessEvents(HSSFRequest req, InputStream in)
throws IOException, HSSFUserException
{
return genericProcessEvents(req, new RecordInputStream(in));
}
/**
/**
* Processes a DocumentInputStream into essentially Record events.
*
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
* @return numeric user-specified result code.
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
* @return numeric user-specified result code.
*/
public short abortableProcessEvents(HSSFRequest req, InputStream in)
throws HSSFUserException {
return genericProcessEvents(req, new RecordInputStream(in));
}
/**
* Processes a DocumentInputStream into essentially Record events.
*
* @see org.apache.poi.poifs.filesystem.POIFSFileSystem#createDocumentInputStream(String)
* @param req an Instance of HSSFRequest which has your registered listeners
* @param in a DocumentInputStream obtained from POIFS's POIFSFileSystem object
* @return numeric user-specified result code.
*/
protected short genericProcessEvents(HSSFRequest req, RecordInputStream in)
throws IOException, HSSFUserException
{
boolean going = true;
throws HSSFUserException {
short userCode = 0;
Record r = null;
// Create a new RecordStream and use that
RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in);
RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in, false);
// Process each record as they come in
while(going) {
r = recordStream.nextRecord();
if(r != null) {
userCode = req.processRecord(r);
if (userCode != 0) break;
} else {
going = false;
while(true) {
Record r = recordStream.nextRecord();
if(r == null) {
break;
}
userCode = req.processRecord(r);
if (userCode != 0) {
break;
}
}
// All done, return our last code
return userCode;
}
}
}

View File

@ -369,12 +369,11 @@ public final class RecordFactory {
public static List<Record> createRecords(InputStream in) throws RecordFormatException {
List<Record> records = new ArrayList<Record>(NUM_RECORDS);
RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in));
recStream.setIncludeContinueRecords(true);
RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in), true);
Record record;
Record record;
while ((record = recStream.nextRecord())!=null) {
records.add(record);
records.add(record);
}
return records;

View File

@ -19,10 +19,6 @@ package org.apache.poi.hssf.record;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
/**
* A stream based way to get at complete records, with
* as low a memory footprint as possible.
@ -34,203 +30,209 @@ import java.util.List;
* them, but this does allow for a "pull" style of coding.
*/
public class RecordFactoryInputStream {
private final RecordInputStream recStream;
/**
* Have we returned all the records there are?
*/
private boolean complete = false;
private final RecordInputStream _recStream;
private final boolean _shouldIncludeContinueRecords;
/**
* Sometimes we end up with a bunch of
* records. When we do, these should
* be returned before the next normal
* record processing occurs (i.e. before
* we check for continue records and
* return rec)
*/
private final LinkedList bonusRecords = new LinkedList();
/**
* Temporarily stores a group of {@link NumberRecord}s. This is used when the most
* recently read underlying record is a {@link MulRKRecord}
*/
private NumberRecord[] _multipleNumberRecords;
/**
* The most recent record that we gave to the user
*/
private Record lastRecord = null;
/**
* The most recent DrawingRecord seen
*/
private DrawingRecord lastDrawingRecord = new DrawingRecord();
/**
* used to help iterating over multiple number records
*/
private int _multipleNumberRecordIndex = -1;
private int bofDepth = 0;
/**
* The most recent record that we gave to the user
*/
private Record _lastRecord = null;
/**
* The most recent DrawingRecord seen
*/
private DrawingRecord _lastDrawingRecord = new DrawingRecord();
private boolean lastRecordWasEOFLevelZero = false;
private int _bofDepth;
private boolean includeContinueRecords = false;
private boolean _lastRecordWasEOFLevelZero;
public RecordFactoryInputStream(RecordInputStream inp) {
recStream = inp;
}
/**
* Returns the next (complete) record from the
* stream, or null if there are no more.
*/
public Record nextRecord() {
Record r = null;
/**
* @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
* {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
* processing).
*/
public RecordFactoryInputStream(RecordInputStream inp, boolean shouldIncludeContinueRecords) {
_recStream = inp;
_shouldIncludeContinueRecords = shouldIncludeContinueRecords;
// Loop until we get something
while (r == null && !complete) {
// Are there any bonus records that we need to
// return?
r = getBonusRecord();
/*
* How to recognise end of stream?
* In the best case, the underlying input stream (in) ends just after the last EOF record
* Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
* reliably use zeros for padding and if this were always the case, this code could just
* skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
* non-zero padding that is read OK by Excel (Excel also fixes the padding).
*
* So to properly detect the workbook end of stream, this code has to identify the last
* EOF record. This is not so easy because the workbook bof+eof pair do not bracket the
* whole stream. The worksheets follow the workbook, but it is not easy to tell how many
* sheet sub-streams should be present. Hence we are looking for an EOF record that is not
* immediately followed by a BOF record. One extra complication is that bof+eof sub-
* streams can be nested within worksheet streams and it's not clear in these cases what
* record might follow any EOF record. So we also need to keep track of the bof/eof
* nesting level.
*/
_bofDepth=0;
_lastRecordWasEOFLevelZero = false;
}
// If not, ask for the next real record
if (r == null) {
r = getNextRecord();
}
}
/**
* Returns the next (complete) record from the
* stream, or null if there are no more.
*/
public Record nextRecord() {
Record r;
r = getNextMultipleNumberRecord();
if (r != null) {
// found a NumberRecord (expanded from a recent MULRK record)
return r;
}
while (true) {
if (!_recStream.hasNextRecord()) {
// recStream is exhausted;
return null;
}
// All done
return r;
}
// step underlying RecordInputStream to the next record
_recStream.nextRecord();
/**
* If there are any "bonus" records, that should
* be returned before processing new ones,
* grabs the next and returns it.
* If not, returns null;
*/
private Record getBonusRecord() {
if (!bonusRecords.isEmpty()) {
return (Record) bonusRecords.removeFirst();
}
return null;
}
if (_lastRecordWasEOFLevelZero) {
// Potential place for ending the workbook stream
// Check that the next record is not BOFRecord(0x0809)
// Normally the input stream contains only zero padding after the last EOFRecord,
// but bug 46987 suggests that the padding may be garbage.
// This code relies on the padding bytes not starting with BOFRecord.sid
if (_recStream.getSid() != BOFRecord.sid) {
return null;
}
// else - another sheet substream starting here
}
/**
* Returns the next available record, or null if
* this pass didn't return a record that's
* suitable for returning (eg was a continue record).
*/
private Record getNextRecord() {
/*
* How to recognise end of stream?
* In the best case, the underlying input stream (in) ends just after the last EOF record
* Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
* reliably use zeros for padding and if this were always the case, this code could just
* skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
* non-zero padding that is read OK by Excel (Excel also fixes the padding).
*
* So to properly detect the workbook end of stream, this code has to identify the last
* EOF record. This is not so easy because the worbook bof+eof pair do not bracket the
* whole stream. The worksheets follow the workbook, but it is not easy to tell how many
* sheet sub-streams should be present. Hence we are looking for an EOF record that is not
* immediately followed by a BOF record. One extra complication is that bof+eof sub-
* streams can be nested within worksheet streams and it's not clear in these cases what
* record might follow any EOF record. So we also need to keep track of the bof/eof
* nesting level.
*/
r = readNextRecord();
if (r == null) {
// some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
continue;
}
return r;
}
}
if (recStream.hasNextRecord()) {
// Grab our next record
recStream.nextRecord();
/**
 * Hands out, one at a time, the {@link NumberRecord}s that are still pending from the
 * most recently expanded {@link MulRKRecord}.
 *
 * @return the next pending {@link NumberRecord}, or <code>null</code> when no group is
 * pending or the pending group has been fully consumed (in which case the group state
 * is reset).
 */
private NumberRecord getNextMultipleNumberRecord() {
    NumberRecord[] pending = _multipleNumberRecords;
    if (pending == null) {
        // no expanded group waiting to be returned
        return null;
    }
    int pos = _multipleNumberRecordIndex;
    if (pos >= pending.length) {
        // group exhausted - clear the state so the next call short-circuits
        _multipleNumberRecordIndex = -1;
        _multipleNumberRecords = null;
        return null;
    }
    NumberRecord next = pending[pos];
    _multipleNumberRecordIndex = pos + 1;
    return next;
}
if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
// Normally InputStream (in) contains only zero padding after this point
complete = true;
return null;
}
/**
* @return the next available record, or <code>null</code> if
* this pass didn't return a record that's
* suitable for returning (eg was a continue record).
*/
private Record readNextRecord() {
Record record = RecordFactory.createSingleRecord(recStream);
lastRecordWasEOFLevelZero = false;
Record record = RecordFactory.createSingleRecord(_recStream);
_lastRecordWasEOFLevelZero = false;
if (record instanceof BOFRecord) {
bofDepth++;
return record;
}
if (record instanceof BOFRecord) {
_bofDepth++;
return record;
}
if (record instanceof EOFRecord) {
bofDepth--;
if (bofDepth < 1) {
lastRecordWasEOFLevelZero = true;
}
if (record instanceof EOFRecord) {
_bofDepth--;
if (_bofDepth < 1) {
_lastRecordWasEOFLevelZero = true;
}
return record;
}
return record;
}
if (record instanceof DBCellRecord) {
// Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
return null;
}
if (record instanceof DBCellRecord) {
// Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
return null;
}
if (record instanceof RKRecord) {
return RecordFactory.convertToNumberRecord((RKRecord) record);
}
if (record instanceof RKRecord) {
return RecordFactory.convertToNumberRecord((RKRecord) record);
}
if (record instanceof MulRKRecord) {
NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
if (record instanceof MulRKRecord) {
NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
List<NumberRecord> list = Arrays.asList(records);
bonusRecords.addAll(list.subList(1, list.size()));
_multipleNumberRecords = records;
_multipleNumberRecordIndex = 1;
return records[0];
}
return records[0];
}
if (record.getSid() == DrawingGroupRecord.sid
&& _lastRecord instanceof DrawingGroupRecord) {
DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
lastDGRecord.join((AbstractEscherHolderRecord) record);
return null;
}
if (record.getSid() == ContinueRecord.sid) {
ContinueRecord contRec = (ContinueRecord) record;
if (record.getSid() == DrawingGroupRecord.sid
&& lastRecord instanceof DrawingGroupRecord) {
DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord;
lastDGRecord.join((AbstractEscherHolderRecord) record);
return null;
} else if (record.getSid() == ContinueRecord.sid) {
ContinueRecord contRec = (ContinueRecord) record;
if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
// Drawing records have a very strange continue behaviour.
//There can actually be OBJ records mixed between the continues.
lastDrawingRecord.processContinueRecord(contRec.getData());
//we must remember the position of the continue record.
//in the serialization procedure the original structure of records must be preserved
if (includeContinueRecords) {
return record;
} else {
return null;
}
} else if (lastRecord instanceof DrawingGroupRecord) {
((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData());
return null;
} else if (lastRecord instanceof DrawingRecord) {
((DrawingRecord) lastRecord).processContinueRecord(contRec.getData());
return null;
} else if (lastRecord instanceof UnknownRecord) {
//Gracefully handle records that we don't know about,
//that happen to be continued
return record;
} else if (lastRecord instanceof EOFRecord) {
// This is really odd, but excel still sometimes
// outputs a file like this all the same
return record;
} else {
throw new RecordFormatException("Unhandled Continue Record");
}
} else {
lastRecord = record;
if (record instanceof DrawingRecord) {
lastDrawingRecord = (DrawingRecord) record;
}
return record;
}
} else {
// No more records
complete = true;
return null;
}
}
/**
* Return or not ContinueRecord in nextRecord
*/
public void setIncludeContinueRecords(boolean includeContinueRecords) {
this.includeContinueRecords = includeContinueRecords;
}
}
if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
// Drawing records have a very strange continue behaviour.
//There can actually be OBJ records mixed between the continues.
_lastDrawingRecord.processContinueRecord(contRec.getData());
//we must remember the position of the continue record.
//in the serialization procedure the original structure of records must be preserved
if (_shouldIncludeContinueRecords) {
return record;
}
return null;
}
if (_lastRecord instanceof DrawingGroupRecord) {
((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
return null;
}
if (_lastRecord instanceof DrawingRecord) {
((DrawingRecord) _lastRecord).processContinueRecord(contRec.getData());
return null;
}
if (_lastRecord instanceof UnknownRecord) {
//Gracefully handle records that we don't know about,
//that happen to be continued
return record;
}
if (_lastRecord instanceof EOFRecord) {
// This is really odd, but excel still sometimes
// outputs a file like this all the same
return record;
}
throw new RecordFormatException("Unhandled Continue Record");
}
_lastRecord = record;
if (record instanceof DrawingRecord) {
_lastDrawingRecord = (DrawingRecord) record;
}
return record;
}
}