373 lines
12 KiB
Java
373 lines
12 KiB
Java
/* ====================================================================
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
this work for additional information regarding copyright ownership.
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
(the "License"); you may not use this file except in compliance with
|
|
the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==================================================================== */
|
|
package org.apache.poi.hssf.record;
|
|
|
|
import java.io.InputStream;
|
|
import java.security.GeneralSecurityException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
import org.apache.poi.EncryptedDocumentException;
|
|
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
|
|
import org.apache.poi.hssf.eventusermodel.HSSFListener;
|
|
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
|
import org.apache.poi.poifs.crypt.Decryptor;
|
|
import org.apache.poi.poifs.crypt.EncryptionInfo;
|
|
|
|
/**
|
|
* A stream based way to get at complete records, with
|
|
* as low a memory footprint as possible.
|
|
* This handles reading from a RecordInputStream, turning
|
|
* the data into full records, processing continue records
|
|
* etc.
|
|
* Most users should use {@link HSSFEventFactory} /
|
|
* {@link HSSFListener} and have new records pushed to
|
|
* them, but this does allow for a "pull" style of coding.
|
|
*/
|
|
public final class RecordFactoryInputStream {
|
|
|
|
/**
|
|
* Keeps track of the sizes of the initial records up to and including {@link FilePassRecord}
|
|
* Needed for protected files because each byte is encrypted with respect to its absolute
|
|
* position from the start of the stream.
|
|
*/
|
|
private static final class StreamEncryptionInfo {
|
|
private final int _initialRecordsSize;
|
|
private final FilePassRecord _filePassRec;
|
|
private final Record _lastRecord;
|
|
private final boolean _hasBOFRecord;
|
|
|
|
public StreamEncryptionInfo(RecordInputStream rs, List<Record> outputRecs) {
|
|
Record rec;
|
|
rs.nextRecord();
|
|
int recSize = 4 + rs.remaining();
|
|
rec = RecordFactory.createSingleRecord(rs);
|
|
outputRecs.add(rec);
|
|
FilePassRecord fpr = null;
|
|
if (rec instanceof BOFRecord) {
|
|
_hasBOFRecord = true;
|
|
|
|
// Fetch the next record, and see if it indicates whether
|
|
// the document is encrypted or not
|
|
if (rs.hasNextRecord()) {
|
|
rs.nextRecord();
|
|
rec = RecordFactory.createSingleRecord(rs);
|
|
recSize += rec.getRecordSize();
|
|
outputRecs.add(rec);
|
|
|
|
// Encrypted is normally BOF then FILEPASS
|
|
// May sometimes be BOF, WRITEPROTECT, FILEPASS
|
|
if (rec instanceof WriteProtectRecord && rs.hasNextRecord()) {
|
|
rs.nextRecord();
|
|
rec = RecordFactory.createSingleRecord(rs);
|
|
recSize += rec.getRecordSize();
|
|
outputRecs.add(rec);
|
|
}
|
|
|
|
// If it's a FILEPASS, track it specifically but
|
|
// don't include it in the main stream
|
|
if (rec instanceof FilePassRecord) {
|
|
fpr = (FilePassRecord) rec;
|
|
outputRecs.remove(outputRecs.size()-1);
|
|
// TODO - add fpr not added to outputRecs
|
|
rec = outputRecs.get(0);
|
|
} else {
|
|
// workbook not encrypted (typical case)
|
|
if (rec instanceof EOFRecord) {
|
|
// A workbook stream is never empty, so crash instead
|
|
// of trying to keep track of nesting level
|
|
throw new IllegalStateException("Nothing between BOF and EOF");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Invalid in a normal workbook stream.
|
|
// However, some test cases work on sub-sections of
|
|
// the workbook stream that do not begin with BOF
|
|
_hasBOFRecord = false;
|
|
}
|
|
_initialRecordsSize = recSize;
|
|
_filePassRec = fpr;
|
|
_lastRecord = rec;
|
|
}
|
|
|
|
public RecordInputStream createDecryptingStream(InputStream original) {
|
|
FilePassRecord fpr = _filePassRec;
|
|
String userPassword = Biff8EncryptionKey.getCurrentUserPassword();
|
|
if (userPassword == null) {
|
|
userPassword = Decryptor.DEFAULT_PASSWORD;
|
|
}
|
|
|
|
EncryptionInfo info = fpr.getEncryptionInfo();
|
|
try {
|
|
if (!info.getDecryptor().verifyPassword(userPassword)) {
|
|
throw new EncryptedDocumentException(
|
|
(Decryptor.DEFAULT_PASSWORD.equals(userPassword) ? "Default" : "Supplied")
|
|
+ " password is invalid for salt/verifier/verifierHash");
|
|
}
|
|
} catch (GeneralSecurityException e) {
|
|
throw new EncryptedDocumentException(e);
|
|
}
|
|
|
|
return new RecordInputStream(original, info, _initialRecordsSize);
|
|
}
|
|
|
|
public boolean hasEncryption() {
|
|
return _filePassRec != null;
|
|
}
|
|
|
|
/**
|
|
* @return last record scanned while looking for encryption info.
|
|
* This will typically be the first or second record read. Possibly <code>null</code>
|
|
* if stream was empty
|
|
*/
|
|
public Record getLastRecord() {
|
|
return _lastRecord;
|
|
}
|
|
|
|
/**
|
|
* <code>false</code> in some test cases
|
|
*/
|
|
public boolean hasBOFRecord() {
|
|
return _hasBOFRecord;
|
|
}
|
|
}
|
|
|
|
|
|
private final RecordInputStream _recStream;
|
|
private final boolean _shouldIncludeContinueRecords;
|
|
|
|
/**
|
|
* Temporarily stores a group of {@link Record}s, for future return by {@link #nextRecord()}.
|
|
* This is used at the start of the workbook stream, and also when the most recently read
|
|
* underlying record is a {@link MulRKRecord}
|
|
*/
|
|
private Record[] _unreadRecordBuffer;
|
|
|
|
/**
|
|
* used to help iterating over the unread records
|
|
*/
|
|
private int _unreadRecordIndex = -1;
|
|
|
|
/**
|
|
* The most recent record that we gave to the user
|
|
*/
|
|
private Record _lastRecord = null;
|
|
/**
|
|
* The most recent DrawingRecord seen
|
|
*/
|
|
private DrawingRecord _lastDrawingRecord = new DrawingRecord();
|
|
|
|
private int _bofDepth;
|
|
|
|
private boolean _lastRecordWasEOFLevelZero;
|
|
|
|
|
|
/**
|
|
* @param in the InputStream to read from
|
|
*
|
|
* @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
|
|
* {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
|
|
* processing).
|
|
*/
|
|
public RecordFactoryInputStream(InputStream in, boolean shouldIncludeContinueRecords) {
|
|
RecordInputStream rs = new RecordInputStream(in);
|
|
List<Record> records = new ArrayList<Record>();
|
|
StreamEncryptionInfo sei = new StreamEncryptionInfo(rs, records);
|
|
if (sei.hasEncryption()) {
|
|
rs = sei.createDecryptingStream(in);
|
|
} else {
|
|
// typical case - non-encrypted stream
|
|
}
|
|
|
|
if (!records.isEmpty()) {
|
|
_unreadRecordBuffer = new Record[records.size()];
|
|
records.toArray(_unreadRecordBuffer);
|
|
_unreadRecordIndex =0;
|
|
}
|
|
_recStream = rs;
|
|
_shouldIncludeContinueRecords = shouldIncludeContinueRecords;
|
|
_lastRecord = sei.getLastRecord();
|
|
|
|
/*
|
|
* How to recognise end of stream?
|
|
* In the best case, the underlying input stream (in) ends just after the last EOF record
|
|
* Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
|
|
* reliably use zeros for padding and if this were always the case, this code could just
|
|
* skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
|
|
* non-zero padding that is read OK by Excel (Excel also fixes the padding).
|
|
*
|
|
* So to properly detect the workbook end of stream, this code has to identify the last
|
|
* EOF record. This is not so easy because the worbook bof+eof pair do not bracket the
|
|
* whole stream. The worksheets follow the workbook, but it is not easy to tell how many
|
|
* sheet sub-streams should be present. Hence we are looking for an EOF record that is not
|
|
* immediately followed by a BOF record. One extra complication is that bof+eof sub-
|
|
* streams can be nested within worksheet streams and it's not clear in these cases what
|
|
* record might follow any EOF record. So we also need to keep track of the bof/eof
|
|
* nesting level.
|
|
*/
|
|
_bofDepth = sei.hasBOFRecord() ? 1 : 0;
|
|
_lastRecordWasEOFLevelZero = false;
|
|
}
|
|
|
|
/**
|
|
* @return the next (complete) record from the stream, or null if there are no more.
|
|
*/
|
|
public Record nextRecord() {
|
|
Record r;
|
|
r = getNextUnreadRecord();
|
|
if (r != null) {
|
|
// found an unread record
|
|
return r;
|
|
}
|
|
while (true) {
|
|
if (!_recStream.hasNextRecord()) {
|
|
// recStream is exhausted;
|
|
return null;
|
|
}
|
|
|
|
if (_lastRecordWasEOFLevelZero) {
|
|
// Potential place for ending the workbook stream
|
|
// Check that the next record is not BOFRecord(0x0809)
|
|
// Normally the input stream contains only zero padding after the last EOFRecord,
|
|
// but bug 46987 and 48068 suggests that the padding may be garbage.
|
|
// This code relies on the padding bytes not starting with BOFRecord.sid
|
|
if (_recStream.getNextSid() != BOFRecord.sid) {
|
|
return null;
|
|
}
|
|
// else - another sheet substream starting here
|
|
}
|
|
|
|
// step underlying RecordInputStream to the next record
|
|
_recStream.nextRecord();
|
|
|
|
r = readNextRecord();
|
|
if (r == null) {
|
|
// some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
|
|
continue;
|
|
}
|
|
return r;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @return the next {@link Record} from the multiple record group as expanded from
|
|
* a recently read {@link MulRKRecord}. <code>null</code> if not present.
|
|
*/
|
|
private Record getNextUnreadRecord() {
|
|
if (_unreadRecordBuffer != null) {
|
|
int ix = _unreadRecordIndex;
|
|
if (ix < _unreadRecordBuffer.length) {
|
|
Record result = _unreadRecordBuffer[ix];
|
|
_unreadRecordIndex = ix + 1;
|
|
return result;
|
|
}
|
|
_unreadRecordIndex = -1;
|
|
_unreadRecordBuffer = null;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* @return the next available record, or <code>null</code> if
|
|
* this pass didn't return a record that's
|
|
* suitable for returning (eg was a continue record).
|
|
*/
|
|
private Record readNextRecord() {
|
|
|
|
Record record = RecordFactory.createSingleRecord(_recStream);
|
|
_lastRecordWasEOFLevelZero = false;
|
|
|
|
if (record instanceof BOFRecord) {
|
|
_bofDepth++;
|
|
return record;
|
|
}
|
|
|
|
if (record instanceof EOFRecord) {
|
|
_bofDepth--;
|
|
if (_bofDepth < 1) {
|
|
_lastRecordWasEOFLevelZero = true;
|
|
}
|
|
|
|
return record;
|
|
}
|
|
|
|
if (record instanceof DBCellRecord) {
|
|
// Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
|
|
return null;
|
|
}
|
|
|
|
if (record instanceof RKRecord) {
|
|
return RecordFactory.convertToNumberRecord((RKRecord) record);
|
|
}
|
|
|
|
if (record instanceof MulRKRecord) {
|
|
Record[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
|
|
|
|
_unreadRecordBuffer = records;
|
|
_unreadRecordIndex = 1;
|
|
return records[0];
|
|
}
|
|
|
|
if (record.getSid() == DrawingGroupRecord.sid
|
|
&& _lastRecord instanceof DrawingGroupRecord) {
|
|
DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
|
|
lastDGRecord.join((AbstractEscherHolderRecord) record);
|
|
return null;
|
|
}
|
|
if (record.getSid() == ContinueRecord.sid) {
|
|
ContinueRecord contRec = (ContinueRecord) record;
|
|
|
|
if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
|
|
// Drawing records have a very strange continue behaviour.
|
|
//There can actually be OBJ records mixed between the continues.
|
|
_lastDrawingRecord.processContinueRecord(contRec.getData());
|
|
//we must remember the position of the continue record.
|
|
//in the serialization procedure the original structure of records must be preserved
|
|
if (_shouldIncludeContinueRecords) {
|
|
return record;
|
|
}
|
|
return null;
|
|
}
|
|
if (_lastRecord instanceof DrawingGroupRecord) {
|
|
((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
|
|
return null;
|
|
}
|
|
if (_lastRecord instanceof DrawingRecord) {
|
|
// ((DrawingRecord) _lastRecord).appendContinueRecord(contRec.getData());
|
|
return contRec;
|
|
}
|
|
if (_lastRecord instanceof UnknownRecord) {
|
|
//Gracefully handle records that we don't know about,
|
|
//that happen to be continued
|
|
return record;
|
|
}
|
|
if (_lastRecord instanceof EOFRecord) {
|
|
// This is really odd, but excel still sometimes
|
|
// outputs a file like this all the same
|
|
return record;
|
|
}
|
|
throw new RecordFormatException("Unhandled Continue Record followining " + _lastRecord.getClass());
|
|
}
|
|
_lastRecord = record;
|
|
if (record instanceof DrawingRecord) {
|
|
_lastDrawingRecord = (DrawingRecord) record;
|
|
}
|
|
return record;
|
|
}
|
|
}
|