From 15ffa9ebda510bad2a9bd9b0526ac82ead1cf050 Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Sun, 5 Jul 2009 14:10:49 +0000 Subject: [PATCH] refactored HSSFEventFactory to use RecordFactory instead of HSSFRecordStream, see Bugzilla 47448 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@791251 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../hssf/eventusermodel/HSSFEventFactory.java | 2 +- .../hssf/eventusermodel/HSSFRecordStream.java | 234 ------------------ .../apache/poi/hssf/record/RecordFactory.java | 126 +--------- .../hssf/record/RecordFactoryInputStream.java | 233 +++++++++++++++++ 5 files changed, 248 insertions(+), 348 deletions(-) delete mode 100644 src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java create mode 100755 src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 3fcb9fedc..816028b3f 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 47448 - Allow HSSFEventFactory to handle non-zero padding at the end of the workbook stream 47456 - Support for getting OLE object data in PowerPointExtractor 47411 - Explicitly set the 1900 date system when creating XSSF workbooks 47400 - Support fo text extraction of footnotes, endnotes and comments in HWPF diff --git a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java index f8e64c928..2238b39b8 100644 --- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java +++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java @@ -134,7 +134,7 @@ public class HSSFEventFactory Record r = null; // Create a new RecordStream and use that - HSSFRecordStream recordStream = new HSSFRecordStream(in); + RecordFactoryInputStream recordStream = new 
RecordFactoryInputStream(in); // Process each record as they come in while(going) { diff --git a/src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java b/src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java deleted file mode 100644 index feb7a36d5..000000000 --- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java +++ /dev/null @@ -1,234 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hssf.eventusermodel; - -import java.util.Vector; - -import org.apache.poi.hssf.record.ContinueRecord; -import org.apache.poi.hssf.record.DrawingGroupRecord; -import org.apache.poi.hssf.record.DrawingRecord; -import org.apache.poi.hssf.record.ObjRecord; -import org.apache.poi.hssf.record.Record; -import org.apache.poi.hssf.record.RecordFactory; -import org.apache.poi.hssf.record.RecordFormatException; -import org.apache.poi.hssf.record.RecordInputStream; -import org.apache.poi.hssf.record.TextObjectRecord; -import org.apache.poi.hssf.record.UnknownRecord; - -/** - * A stream based way to get at complete records, with - * as low a memory footprint as possible. 
- * This handles reading from a RecordInputStream, turning - * the data into full records, processing continue records - * etc. - * Most users should use {@link HSSFEventFactory} / - * {@link HSSFListener} and have new records pushed to - * them, but this does allow for a "pull" style of coding. - */ -public class HSSFRecordStream { - private RecordInputStream in; - - /** Have we run out of records on the stream? */ - private boolean hitEOS = false; - /** Have we returned all the records there are? */ - private boolean complete = false; - - /** - * Sometimes we end up with a bunch of - * records. When we do, these should - * be returned before the next normal - * record processing occurs (i.e. before - * we check for continue records and - * return rec) - */ - private Vector bonusRecords = null; - - /** - * The next record to return, which may need to have its - * continue records passed to it before we do - */ - private Record rec = null; - /** - * The most recent record that we gave to the user - */ - private Record lastRec = null; - /** - * The most recent DrawingRecord seen - */ - private DrawingRecord lastDrawingRecord = new DrawingRecord(); - - public HSSFRecordStream(RecordInputStream inp) { - this.in = inp; - } - - /** - * Returns the next (complete) record from the - * stream, or null if there are no more. - */ - public Record nextRecord() { - Record r = null; - - // Loop until we get something - while(r == null && !complete) { - // Are there any bonus records that we need to - // return? - r = getBonusRecord(); - - // If not, ask for the next real record - if(r == null) { - r = getNextRecord(); - } - } - - // All done - return r; - } - - /** - * If there are any "bonus" records, that should - * be returned before processing new ones, - * grabs the next and returns it. 
- * If not, returns null; - */ - private Record getBonusRecord() { - if(bonusRecords != null) { - Record r = (Record)bonusRecords.remove(0); - if(bonusRecords.size() == 0) { - bonusRecords = null; - } - return r; - } - return null; - } - - /** - * Returns the next available record, or null if - * this pass didn't return a record that's - * suitable for returning (eg was a continue record). - */ - private Record getNextRecord() { - Record toReturn = null; - - if(in.hasNextRecord()) { - // Grab our next record - in.nextRecord(); - short sid = in.getSid(); - - // - // for some reasons we have to make the workbook to be at least 4096 bytes - // but if we have such workbook we fill the end of it with zeros (many zeros) - // - // it is not good: - // if the length( all zero records ) % 4 = 1 - // e.g.: any zero record would be readed as 4 bytes at once ( 2 - id and 2 - size ). - // And the last 1 byte will be readed WRONG ( the id must be 2 bytes ) - // - // So we should better to check if the sid is zero and not to read more data - // The zero sid shows us that rest of the stream data is a fake to make workbook - // certain size - // - if ( sid == 0 ) - return null; - - - // If we had a last record, and this one - // isn't a continue record, then pass - // it on to the listener - if ((rec != null) && (sid != ContinueRecord.sid)) - { - // This last record ought to be returned - toReturn = rec; - } - - // If this record isn't a continue record, - // then build it up - if (sid != ContinueRecord.sid) - { - //System.out.println("creating "+sid); - Record[] recs = RecordFactory.createRecord(in); - - // We know that the multiple record situations - // don't contain continue records, so just - // pass those on to the listener now - if (recs.length > 1) { - bonusRecords = new Vector(recs.length-1); - for (int k = 0; k < (recs.length - 1); k++) { - bonusRecords.add(recs[k]); - } - } - - // Regardless of the number we created, always hold - // onto the last record to be processed 
on the next - // loop, in case it has any continue records - rec = recs[ recs.length - 1 ]; - // Don't return it just yet though, as we probably have - // a record from the last round to return - } - else { - // Normally, ContinueRecords are handled internally - // However, in a few cases, there is a gap between a record at - // its Continue, so we have to handle them specially - // This logic is much like in RecordFactory.createRecords() - Record[] recs = RecordFactory.createRecord(in); - ContinueRecord crec = (ContinueRecord)recs[0]; - if((lastRec instanceof ObjRecord) || (lastRec instanceof TextObjectRecord)) { - // You can have Obj records between a DrawingRecord - // and its continue! - lastDrawingRecord.processContinueRecord( crec.getData() ); - // Trigger them on the drawing record, now it's complete - rec = lastDrawingRecord; - } - else if((lastRec instanceof DrawingGroupRecord)) { - ((DrawingGroupRecord)lastRec).processContinueRecord(crec.getData()); - // Trigger them on the drawing record, now it's complete - rec = lastRec; - } - else { - if (rec instanceof UnknownRecord) { - ;//silently skip records we don't know about - } else { - throw new RecordFormatException("Records should handle ContinueRecord internally. 
Should not see this exception"); - } - } - } - - // Update our tracking of the last record - lastRec = rec; - if(rec instanceof DrawingRecord) { - lastDrawingRecord = (DrawingRecord)rec; - } - } else { - // No more records - hitEOS = true; - } - - // If we've hit the end-of-stream, then - // finish off the last record and be done - if(hitEOS) { - complete = true; - - // Return the last record if there was - // one, otherwise null - if(rec != null) { - toReturn = rec; - rec = null; - } - } - - return toReturn; - } -} \ No newline at end of file diff --git a/src/java/org/apache/poi/hssf/record/RecordFactory.java b/src/java/org/apache/poi/hssf/record/RecordFactory.java index 1f4ba9f95..53213dfc1 100644 --- a/src/java/org/apache/poi/hssf/record/RecordFactory.java +++ b/src/java/org/apache/poi/hssf/record/RecordFactory.java @@ -17,21 +17,14 @@ package org.apache.poi.hssf.record; +import org.apache.poi.hssf.record.chart.*; +import org.apache.poi.hssf.record.pivottable.*; + import java.io.InputStream; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Modifier; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.poi.hssf.record.chart.*; -import org.apache.poi.hssf.record.pivottable.*; +import java.util.*; /** * Title: Record Factory

@@ -259,7 +252,7 @@ public final class RecordFactory { return new Record[] { record, }; } - static Record createSingleRecord(RecordInputStream in) { + public static Record createSingleRecord(RecordInputStream in) { I_RecordCreator constructor = _recordCreatorsById.get(new Integer(in.getSid())); if (constructor == null) { @@ -273,7 +266,7 @@ public final class RecordFactory { * RK record is a slightly smaller alternative to NumberRecord * POI likes NumberRecord better */ - private static NumberRecord convertToNumberRecord(RKRecord rk) { + public static NumberRecord convertToNumberRecord(RKRecord rk) { NumberRecord num = new NumberRecord(); num.setColumn(rk.getColumn()); @@ -286,7 +279,7 @@ public final class RecordFactory { /** * Converts a {@link MulRKRecord} into an equivalent array of {@link NumberRecord}s */ - private static NumberRecord[] convertRKRecords(MulRKRecord mrk) { + public static NumberRecord[] convertRKRecords(MulRKRecord mrk) { NumberRecord[] mulRecs = new NumberRecord[mrk.getNumColumns()]; for (int k = 0; k < mrk.getNumColumns(); k++) { @@ -374,109 +367,16 @@ public final class RecordFactory { * @exception RecordFormatException on error processing the InputStream */ public static List createRecords(InputStream in) throws RecordFormatException { - List records = new ArrayList(NUM_RECORDS); - RecordInputStream recStream = new RecordInputStream(in); - DrawingRecord lastDrawingRecord = new DrawingRecord( ); - Record lastRecord = null; - /* - * How to recognise end of stream? - * In the best case, the underlying input stream (in) ends just after the last EOF record - * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps - * reliably use zeros for padding and if this were always the case, this code could just - * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with - * non-zero padding that is read OK by Excel (Excel also fixes the padding). 
- * - * So to properly detect the workbook end of stream, this code has to identify the last - * EOF record. This is not so easy because the worbook bof+eof pair do not bracket the - * whole stream. The worksheets follow the workbook, but it is not easy to tell how many - * sheet sub-streams should be present. Hence we are looking for an EOF record that is not - * immediately followed by a BOF record. One extra complication is that bof+eof sub- - * streams can be nested within worksheet streams and it's not clear in these cases what - * record might follow any EOF record. So we also need to keep track of the bof/eof - * nesting level. - */ + RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in)); + recStream.setIncludeContinueRecords(true); - int bofDepth=0; - boolean lastRecordWasEOFLevelZero = false; - while (recStream.hasNextRecord()) { - recStream.nextRecord(); - if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) { - // Normally InputStream (in) contains only zero padding after this point - break; - } - Record record = createSingleRecord(recStream); - lastRecordWasEOFLevelZero = false; - if (record instanceof BOFRecord) { - bofDepth++; - records.add(record); - continue; - } - if (record instanceof EOFRecord) { - bofDepth--; - records.add(record); - if (bofDepth<1) { - lastRecordWasEOFLevelZero = true; - } - continue; - } - - if (record instanceof DBCellRecord) { - // Not needed by POI. 
Regenerated from scratch by POI when spreadsheet is written - continue; - } - - if (record instanceof RKRecord) { - records.add(convertToNumberRecord((RKRecord) record)); - continue; - } - if (record instanceof MulRKRecord) { - addAll(records, convertRKRecords((MulRKRecord)record)); - continue; - } - - if (record.getSid() == DrawingGroupRecord.sid - && lastRecord instanceof DrawingGroupRecord) { - DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord; - lastDGRecord.join((AbstractEscherHolderRecord) record); - } else if (record.getSid() == ContinueRecord.sid) { - ContinueRecord contRec = (ContinueRecord)record; - - if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) { - // Drawing records have a very strange continue behaviour. - //There can actually be OBJ records mixed between the continues. - lastDrawingRecord.processContinueRecord(contRec.getData() ); - //we must remember the position of the continue record. - //in the serialization procedure the original structure of records must be preserved - records.add(record); - } else if (lastRecord instanceof DrawingGroupRecord) { - ((DrawingGroupRecord)lastRecord).processContinueRecord(contRec.getData()); - } else if (lastRecord instanceof UnknownRecord) { - //Gracefully handle records that we don't know about, - //that happen to be continued - records.add(record); - } else if (lastRecord instanceof EOFRecord) { - // This is really odd, but excel still sometimes - // outputs a file like this all the same - records.add(record); - } else { - throw new RecordFormatException("Unhandled Continue Record"); - } - } else { - lastRecord = record; - if (record instanceof DrawingRecord) { - lastDrawingRecord = (DrawingRecord) record; - } - records.add(record); - } + Record record; + while ((record = recStream.nextRecord())!=null) { + records.add(record); } + return records; } - - private static void addAll(List destList, Record[] srcRecs) { - for (int i = 0; i < srcRecs.length; i++) { - 
destList.add(srcRecs[i]); - } - } } diff --git a/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java new file mode 100755 index 000000000..285a49094 --- /dev/null +++ b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java @@ -0,0 +1,233 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hssf.record; + +import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; +import org.apache.poi.hssf.eventusermodel.HSSFListener; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * A stream based way to get at complete records, with + * as low a memory footprint as possible. + * This handles reading from a RecordInputStream, turning + * the data into full records, processing continue records + * etc. + * Most users should use {@link HSSFEventFactory} / + * {@link HSSFListener} and have new records pushed to + * them, but this does allow for a "pull" style of coding. 
+ */ +public class RecordFactoryInputStream { + private final RecordInputStream recStream; + + /** + * Have we returned all the records there are? + */ + private boolean complete = false; + + /** + * Sometimes we end up with a bunch of + * records. When we do, these should + * be returned before the next normal + * record processing occurs (i.e. before + * we check for continue records and + * return rec) + */ + private final LinkedList bonusRecords = new LinkedList(); + + /** + * The most recent record that we gave to the user + */ + private Record lastRecord = null; + /** + * The most recent DrawingRecord seen + */ + private DrawingRecord lastDrawingRecord = new DrawingRecord(); + + private int bofDepth = 0; + + private boolean lastRecordWasEOFLevelZero = false; + + private boolean includeContinueRecords = false; + + public RecordFactoryInputStream(RecordInputStream inp) { + recStream = inp; + } + + /** + * Returns the next (complete) record from the + * stream, or null if there are no more. + */ + public Record nextRecord() { + Record r = null; + + // Loop until we get something + while (r == null && !complete) { + // Are there any bonus records that we need to + // return? + r = getBonusRecord(); + + // If not, ask for the next real record + if (r == null) { + r = getNextRecord(); + } + } + + // All done + return r; + } + + /** + * If there are any "bonus" records, that should + * be returned before processing new ones, + * grabs the next and returns it. + * If not, returns null; + */ + private Record getBonusRecord() { + if (!bonusRecords.isEmpty()) { + return (Record) bonusRecords.removeFirst(); + } + return null; + } + + /** + * Returns the next available record, or null if + * this pass didn't return a record that's + * suitable for returning (eg was a continue record). + */ + private Record getNextRecord() { + /* + * How to recognise end of stream? 
+ * In the best case, the underlying input stream (in) ends just after the last EOF record + * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps + * reliably use zeros for padding and if this were always the case, this code could just + * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with + * non-zero padding that is read OK by Excel (Excel also fixes the padding). + * + * So to properly detect the workbook end of stream, this code has to identify the last + * EOF record. This is not so easy because the workbook bof+eof pair do not bracket the + * whole stream. The worksheets follow the workbook, but it is not easy to tell how many + * sheet sub-streams should be present. Hence we are looking for an EOF record that is not + * immediately followed by a BOF record. One extra complication is that bof+eof sub- + * streams can be nested within worksheet streams and it's not clear in these cases what + * record might follow any EOF record. So we also need to keep track of the bof/eof + * nesting level. + */ + + if (recStream.hasNextRecord()) { + // Grab our next record + recStream.nextRecord(); + + if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) { + // Normally InputStream (in) contains only zero padding after this point + complete = true; + return null; + } + + Record record = RecordFactory.createSingleRecord(recStream); + lastRecordWasEOFLevelZero = false; + + if (record instanceof BOFRecord) { + bofDepth++; + return record; + } + + if (record instanceof EOFRecord) { + bofDepth--; + if (bofDepth < 1) { + lastRecordWasEOFLevelZero = true; + } + + return record; + } + + if (record instanceof DBCellRecord) { + // Not needed by POI. 
Regenerated from scratch by POI when spreadsheet is written + return null; + } + + if (record instanceof RKRecord) { + return RecordFactory.convertToNumberRecord((RKRecord) record); + } + + if (record instanceof MulRKRecord) { + NumberRecord[] records = RecordFactory.convertRKRecords((MulRKRecord) record); + + List list = Arrays.asList(records); + bonusRecords.addAll(list.subList(1, list.size())); + + return records[0]; + } + + if (record.getSid() == DrawingGroupRecord.sid + && lastRecord instanceof DrawingGroupRecord) { + DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord; + lastDGRecord.join((AbstractEscherHolderRecord) record); + return null; + } else if (record.getSid() == ContinueRecord.sid) { + ContinueRecord contRec = (ContinueRecord) record; + + if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) { + // Drawing records have a very strange continue behaviour. + //There can actually be OBJ records mixed between the continues. + lastDrawingRecord.processContinueRecord(contRec.getData()); + //we must remember the position of the continue record. 
+ //in the serialization procedure the original structure of records must be preserved + if (includeContinueRecords) { + return record; + } else { + return null; + } + } else if (lastRecord instanceof DrawingGroupRecord) { + ((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData()); + return null; + } else if (lastRecord instanceof UnknownRecord) { + //Gracefully handle records that we don't know about, + //that happen to be continued + return record; + } else if (lastRecord instanceof EOFRecord) { + // This is really odd, but excel still sometimes + // outputs a file like this all the same + return record; + } else { + throw new RecordFormatException("Unhandled Continue Record"); + } + } else { + lastRecord = record; + if (record instanceof DrawingRecord) { + lastDrawingRecord = (DrawingRecord) record; + } + + return record; + } + + } else { + // No more records + complete = true; + return null; + } + } + + /** + * Sets whether a ContinueRecord that follows an ObjRecord or TextObjectRecord is returned by nextRecord + */ + public void setIncludeContinueRecords(boolean includeContinueRecords) { + this.includeContinueRecords = includeContinueRecords; + } +} \ No newline at end of file