poi/src/scratchpad/src/org/apache/poi/hdf/model/HDFObjectFactory.java


/*
* HDFObjectFactory.java
*
* Created on February 24, 2002, 2:17 PM
*/
package org.apache.poi.hdf.model;
//import java.io;
import java.util.ArrayList;
import java.io.InputStream;
import java.io.IOException;
import java.util.List;
import java.util.TreeSet;
import org.apache.poi.hdf.model.hdftypes.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSDocument;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.util.LittleEndian;
/**
* The Object Factory takes in a stream and creates the low level objects
* that represent the data.
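*
* A minimal usage sketch, assuming a Word 97 file on disk (the file name
* below is only illustrative) and calling code that handles IOException:
* <pre>
* // "document.doc" is a placeholder path for an existing Word file
* InputStream in = new FileInputStream("document.doc");
* HDFObjectFactory factory = new HDFObjectFactory(in);
* in.close();
* </pre>
*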
* @author andy
*/
public class HDFObjectFactory
{
/** OLE stuff*/
private POIFSFileSystem _filesystem;
/** The FIB*/
private FileInformationBlock _fib;
/** The DOP*/
private DocumentProperties _dop;
/**the StyleSheet*/
private StyleSheet _styleSheet;
/**list info */
private ListTables _listTables;
/** Font info */
private FontTable _fonts;
/** text pieces */
//BTreeSet _text = new BTreeSet();
TreeSet _text = new TreeSet();
/** document sections */
TreeSet _sections = new TreeSet();
/** document paragraphs */
TreeSet _paragraphs = new TreeSet();
/** document character runs */
TreeSet _characterRuns = new TreeSet();
/** main document stream buffer*/
byte[] _mainDocument;
/** table stream buffer*/
byte[] _tableBuffer;
/** Creates a new instance of HDFObjectFactory
*
* @param istream The InputStream that is the Word document
*
*/
public HDFObjectFactory(InputStream istream) throws IOException
{
//do Ole stuff
_filesystem = new POIFSFileSystem(istream);
DocumentEntry headerProps =
(DocumentEntry)_filesystem.getRoot().getEntry("WordDocument");
_mainDocument = new byte[headerProps.getSize()];
_filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);
_fib = new FileInformationBlock(_mainDocument);
initTableStream();
initTextPieces();
initFormattingProperties();
}
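/**
* Reads just enough of the given Word document to build its
* FileInformationBlock. The returned list currently contains only the
* FIB; the remaining initialization steps are commented out below.
*
* @param istream The InputStream that is the Word document
* @return a List containing the document's FileInformationBlock
* @throws IOException
*/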
public static List getTypes(InputStream istream) throws IOException
{
List results = new ArrayList(1);
//do Ole stuff
POIFSFileSystem filesystem = new POIFSFileSystem(istream);
DocumentEntry headerProps =
(DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
byte[] mainDocument = new byte[headerProps.getSize()];
filesystem.createDocumentInputStream("WordDocument").read(mainDocument);
FileInformationBlock fib = new FileInformationBlock(mainDocument);
// initTableStream();
// initTextPieces();
// initFormattingProperties();
results.add(fib);
return results;
}
/**
* Initializes the table stream
*
* @throws IOException
*/
private void initTableStream() throws IOException
{
String tablename = null;
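//a flag in the FIB records whether Word wrote the table stream to the
//"1Table" or the "0Table" entry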
if(_fib.isTabletype())
{
tablename="1Table";
}
else
{
tablename="0Table";
}
DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename);
//load the table stream into a buffer
int size = tableEntry.getSize();
_tableBuffer = new byte[size];
_filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
}
/**
* Initializes the text pieces. Text is divided into pieces because each
* piece is stored either as Unicode or as compressed 8-bit text, and
* fast saves can scatter the pieces throughout the file.
*
* @throws IOException
*/
private void initTextPieces() throws IOException
{
int pos = _fib.getTextPieceTableOffset();
//skip through the property modifiers (grpprls) that precede the piece
//table; these hold data for fast-saved files
while (_tableBuffer[pos] == 1)
{
pos++;
int skip = LittleEndian.getShort(_tableBuffer, pos);
pos += 2 + skip;
}
if(_tableBuffer[pos] != 2)
{
throw new IOException("The text piece table is corrupted");
}
else
{
//parse out the text pieces
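//the piece table is a PLCF: (pieces + 1) four-byte character positions
//followed by one 8-byte piece descriptor per piece, so
//pieceTableSize = (pieces * 12) + 4; each descriptor holds the piece's
//file position in the four bytes starting at descriptor offset 2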
int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
pos += 4;
int pieces = (pieceTableSize - 4) / 12;
for (int x = 0; x < pieces; x++)
{
int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);
//bit 0x40000000 set means the piece holds compressed 8-bit text; clear
//the flag and halve the value to get the FC in the main document stream
boolean unicode = (filePos & 0x40000000) == 0;
if (!unicode)
{
filePos &= ~(0x40000000);
filePos /= 2;
}
int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
LittleEndian.getInt(_tableBuffer, pos + (x * 4));
TextPiece piece = new TextPiece(filePos, totLength, unicode);
_text.add(piece);
}
}
}
/**
* initializes all of the formatting properties for a Word Document
*/
private void initFormattingProperties()
{
createStyleSheet();
createListTables();
createFontTable();
initDocumentProperties();
initSectionProperties();
initCharacterProperties();
initParagraphProperties();
}
/**
* initializes the CharacterProperties BTree
*/
private void initCharacterProperties()
{
int charOffset = _fib.getChp_bin_table_offset();
int charPlcSize = _fib.getChp_bin_table_size();
int arraySize = (charPlcSize - 4)/8;
//first we must go through the bin table and find the fkps
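//the bin table is a PLCF: (arraySize + 1) four-byte fcs followed by one
//four-byte FKP page number per entry; each FKP is a 512-byte page in the
//main stream (at offset PN * 512) whose last byte holds the run count
//(crun), followed from the front by (crun + 1) four-byte fcs and crun
//one-byte word offsets to the chpxs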
for(int x = 0; x < arraySize; x++)
{
//get the page number (which has nothing to do with a document page)
//of the 512-byte fkp containing the chpxs for this range of text
int PN = LittleEndian.getInt(_tableBuffer, charOffset + (4 * (arraySize + 1) + (4 * x)));
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
//take each fkp and get the chpxs
int crun = LittleEndian.getUnsignedByte(fkp, 511);
for(int y = 0; y < crun; y++)
{
//get the beginning and ending fc of each character run
int fcStart = LittleEndian.getInt(fkp, y * 4);
int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
//get the offset in the fkp of the chpx for this character run
int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);
//optimization if offset == 0 use "Normal" style
if(chpxOffset == 0)
{
_characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
continue;
}
int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
byte[] chpx = new byte[size];
System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
//_papTable.put(new Integer(fcStart), papx);
_characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
}
}
}
/**
* initializes the Paragraph Properties BTree
*/
private void initParagraphProperties()
{
//find paragraphs
int parOffset = _fib.getPap_bin_table_offset();
int parPlcSize = _fib.getPap_bin_table_size();
int arraySize = (parPlcSize - 4)/8;
//first we must go through the bin table and find the fkps
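//same bin table/FKP layout as the character runs, except that a PAPX FKP
//stores a 13-byte BX per run whose first byte is the word offset of the
//papx, and each papx begins with a byte count cb: the papx occupies
//(2 * cb) - 1 bytes, or 2 * cb' bytes (cb' being the next byte) when cb is 0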
for(int x = 0; x < arraySize; x++)
{
int PN = LittleEndian.getInt(_tableBuffer, parOffset + (4 * (arraySize + 1) + (4 * x)));
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
//take each fkp and get the paps
int crun = LittleEndian.getUnsignedByte(fkp, 511);
for(int y = 0; y < crun; y++)
{
//get the beginning fc of each paragraph text run
int fcStart = LittleEndian.getInt(fkp, y * 4);
int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
//get the offset in fkp of the papx for this paragraph
int papxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + (y * 13));
int size = 2 * LittleEndian.getUnsignedByte(fkp, papxOffset);
if(size == 0)
{
size = 2 * LittleEndian.getUnsignedByte(fkp, ++papxOffset);
}
else
{
size--;
}
byte[] papx = new byte[size];
System.arraycopy(fkp, ++papxOffset, papx, 0, size);
_paragraphs.add(new PapxNode(fcStart, fcEnd, papx));
}
}
}
/**
* initializes the SectionProperties BTree
*/
private void initSectionProperties()
{
//find sections
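//the section table is a PLCF: (arraySize + 1) four-byte character
//positions followed by one 12-byte section descriptor per section; bytes
//2-5 of each descriptor give the file offset of the sepx in the main
//stream, and the sepx itself starts with a two-byte length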
int fcMin = _fib.getOffsetFirstChar();
int plcfsedFC = _fib.getSectionPlcOffset();
int plcfsedSize = _fib.getSectionPlcSize();
byte[] plcfsed = new byte[plcfsedSize];
System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
int arraySize = (plcfsedSize - 4)/16;
for(int x = 0; x < arraySize; x++)
{
int sectionStart = LittleEndian.getInt(plcfsed, x * 4) + fcMin;
int sectionEnd = LittleEndian.getInt(plcfsed, (x+1) * 4) + fcMin;
int sepxStart = LittleEndian.getInt(plcfsed, 4 * (arraySize + 1) + (x * 12) + 2);
int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
byte[] sepx = new byte[sepxSize];
System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
_sections.add(node);
}
}
/**
* Initializes the DocumentProperties object unique to this document.
*/
private void initDocumentProperties()
{
int pos = _fib.getDOPOffset();
int size = _fib.getDOPSize();
byte[] dopArray = new byte[size];
System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
_dop = new DocumentProperties(dopArray);
}
/**
* Loads and uncompresses the StyleSheet from the table stream into memory.
*/
private void createStyleSheet()
{
int stshIndex = _fib.getStylesheetOffset();
int stshSize = _fib.getStylesheetSize();
byte[] stsh = new byte[stshSize];
System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);
_styleSheet = new StyleSheet(stsh);
}
/**
* Initializes the list tables for this document
*/
private void createListTables()
{
int lfoOffset = _fib.getFcPlfLfo();
int lfoSize = _fib.getLcbPlfLfo();
byte[] plflfo = new byte[lfoSize];
System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);
int lstOffset = _fib.getFcPlcfLst();
int lstSize = lstOffset;
//not sure if this is a mistake or what. I vaguely remember a trick like
//this
//int lstSize = LittleEndian.getInt(_header, 0x2e2);
if(lstOffset > 0 && lstSize > 0)
{
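//assume the list data (PlcfLst) runs right up to the start of the list
//format overrides (PlfLfo), so its size is the gap between the two offsets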
lstSize = lfoOffset - lstOffset;
byte[] plcflst = new byte[lstSize];
System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);
_listTables = new ListTables(plcflst, plflfo);
}
}
/**
* Initializes this document's FontTable.
*/
private void createFontTable()
{
int fontTableIndex = _fib.getFonts_bin_table_offset();
int fontTableSize = _fib.getFonts_bin_table_size();
byte[] fontTable = new byte[fontTableSize];
System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);
_fonts = new FontTable(fontTable);
}
}