2010-06-30 11:13:10 -04:00
|
|
|
/* ====================================================================
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
==================================================================== */
|
|
|
|
package org.apache.poi.hwpf;
|
|
|
|
|
2016-07-20 18:35:51 -04:00
|
|
|
import java.io.File;
|
2010-06-30 11:13:10 -04:00
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.OutputStream;
|
2017-04-03 22:06:46 -04:00
|
|
|
import java.nio.charset.Charset;
|
2010-06-30 11:13:10 -04:00
|
|
|
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.hwmf.record.HwmfFont;
|
2010-06-30 11:13:10 -04:00
|
|
|
import org.apache.poi.hwpf.model.ComplexFileTable;
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.hwpf.model.FontTable;
|
2010-06-30 11:13:10 -04:00
|
|
|
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.hwpf.model.OldComplexFileTable;
|
|
|
|
import org.apache.poi.hwpf.model.OldFfn;
|
|
|
|
import org.apache.poi.hwpf.model.OldFontTable;
|
2010-07-02 16:59:30 -04:00
|
|
|
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
|
|
|
import org.apache.poi.hwpf.model.OldSectionTable;
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.hwpf.model.OldTextPieceTable;
|
2010-06-30 11:13:10 -04:00
|
|
|
import org.apache.poi.hwpf.model.PieceDescriptor;
|
|
|
|
import org.apache.poi.hwpf.model.TextPiece;
|
|
|
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
2010-07-02 16:59:30 -04:00
|
|
|
import org.apache.poi.hwpf.usermodel.Range;
|
2010-06-30 11:13:10 -04:00
|
|
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
|
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.util.CodePageUtil;
|
2010-06-30 11:13:10 -04:00
|
|
|
import org.apache.poi.util.LittleEndian;
|
2017-04-03 22:06:46 -04:00
|
|
|
import org.apache.poi.util.NotImplemented;
|
|
|
|
import org.apache.poi.util.StringUtil;
|
2010-06-30 11:13:10 -04:00
|
|
|
|
|
|
|
/**
 * Provides very simple support for old (Word 6 / Word 95)
 * files.
 */
|
|
|
|
public class HWPFOldDocument extends HWPFDocumentCore {
|
2017-04-03 22:06:46 -04:00
|
|
|
|
|
|
|
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
|
|
|
|
|
|
|
|
private OldTextPieceTable tpt;
|
2010-06-30 11:13:10 -04:00
|
|
|
|
2011-07-25 08:58:09 -04:00
|
|
|
private StringBuilder _text;
|
2017-04-03 22:06:46 -04:00
|
|
|
|
|
|
|
private final OldFontTable fontTable;
|
|
|
|
private final Charset guessedCharset;
|
2011-07-25 08:58:09 -04:00
|
|
|
|
2010-06-30 11:13:10 -04:00
|
|
|
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
2010-12-28 22:19:46 -05:00
|
|
|
this(fs.getRoot());
|
2010-06-30 11:13:10 -04:00
|
|
|
}
|
|
|
|
|
2010-12-28 22:19:46 -05:00
|
|
|
public HWPFOldDocument(DirectoryNode directory)
|
|
|
|
throws IOException {
|
|
|
|
super(directory);
|
2010-06-30 11:13:10 -04:00
|
|
|
|
|
|
|
// Where are things?
|
2010-07-02 16:59:30 -04:00
|
|
|
int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
|
|
|
|
int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
|
2010-06-30 11:13:10 -04:00
|
|
|
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
2010-07-02 16:59:30 -04:00
|
|
|
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
|
|
|
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
|
|
|
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
2017-04-03 22:06:46 -04:00
|
|
|
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
|
|
|
|
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
|
|
|
|
|
|
|
|
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
|
|
|
|
//TODO: figure out how to map runs/text pieces to fonts
|
|
|
|
//for now, if there's a non standard codepage in one of the fonts
|
|
|
|
//assume that the doc is in that codepage.
|
|
|
|
guessedCharset = guessCodePage(fontTable);
|
|
|
|
|
2010-06-30 11:13:10 -04:00
|
|
|
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
|
|
|
|
|
|
|
// We need to get hold of the text that makes up the
|
|
|
|
// document, which might be regular or fast-saved
|
2011-10-30 04:59:16 -04:00
|
|
|
ComplexFileTable cft = null;
|
2011-10-01 11:32:32 -04:00
|
|
|
if(_fib.getFibBase().isFComplex()) {
|
2017-04-03 22:06:46 -04:00
|
|
|
cft = new OldComplexFileTable(
|
2010-06-30 11:13:10 -04:00
|
|
|
_mainStream, _mainStream,
|
2017-04-03 22:06:46 -04:00
|
|
|
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
|
2010-06-30 11:13:10 -04:00
|
|
|
);
|
2017-04-03 22:06:46 -04:00
|
|
|
tpt = (OldTextPieceTable)cft.getTextPieceTable();
|
2010-06-30 11:13:10 -04:00
|
|
|
|
|
|
|
} else {
|
2010-07-02 16:59:30 -04:00
|
|
|
// TODO Discover if these older documents can ever hold Unicode Strings?
|
|
|
|
// (We think not, because they seem to lack a Piece table)
|
2010-06-30 11:13:10 -04:00
|
|
|
// TODO Build the Piece Descriptor properly
|
2010-07-02 16:59:30 -04:00
|
|
|
// (We have to fake it, as they don't seem to have a proper Piece table)
|
2017-04-03 22:06:46 -04:00
|
|
|
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
|
2011-10-01 11:32:32 -04:00
|
|
|
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
2010-06-30 11:13:10 -04:00
|
|
|
|
2010-07-02 16:59:30 -04:00
|
|
|
// Generate a single Text Piece Table, with a single Text Piece
|
|
|
|
// which covers all the (8 bit only) text in the file
|
2017-04-03 22:06:46 -04:00
|
|
|
tpt = new OldTextPieceTable();
|
2011-10-01 11:32:32 -04:00
|
|
|
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
|
|
|
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
2017-04-03 22:06:46 -04:00
|
|
|
|
|
|
|
int numChars = textData.length;
|
2017-04-04 21:45:55 -04:00
|
|
|
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
2017-04-03 22:06:46 -04:00
|
|
|
numChars /= 2;
|
|
|
|
}
|
|
|
|
|
2010-06-30 11:13:10 -04:00
|
|
|
TextPiece tp = new TextPiece(
|
2017-04-03 22:06:46 -04:00
|
|
|
0, numChars, textData, pd
|
2010-06-30 11:13:10 -04:00
|
|
|
);
|
2010-07-05 08:56:02 -04:00
|
|
|
tpt.add(tp);
|
2010-06-30 11:13:10 -04:00
|
|
|
|
|
|
|
}
|
2011-07-25 08:58:09 -04:00
|
|
|
_text = tpt.getText();
|
|
|
|
|
2010-06-30 11:13:10 -04:00
|
|
|
// Now we can fetch the character and paragraph properties
|
2010-07-02 16:59:30 -04:00
|
|
|
_cbt = new OldCHPBinTable(
|
2010-06-30 11:13:10 -04:00
|
|
|
_mainStream, chpTableOffset, chpTableSize,
|
2011-10-01 11:32:32 -04:00
|
|
|
_fib.getFibBase().getFcMin(), tpt
|
2010-06-30 11:13:10 -04:00
|
|
|
);
|
2010-07-02 16:59:30 -04:00
|
|
|
_pbt = new OldPAPBinTable(
|
|
|
|
_mainStream, papTableOffset, papTableSize,
|
2011-10-01 11:32:32 -04:00
|
|
|
_fib.getFibBase().getFcMin(), tpt
|
2010-07-02 16:59:30 -04:00
|
|
|
);
|
|
|
|
_st = new OldSectionTable(
|
|
|
|
_mainStream, sedTableOffset, sedTableSize,
|
2011-10-01 11:32:32 -04:00
|
|
|
_fib.getFibBase().getFcMin(), tpt
|
2010-07-02 16:59:30 -04:00
|
|
|
);
|
2011-10-30 04:59:16 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* in this mode we preserving PAPX/CHPX structure from file, so text may
|
|
|
|
* miss from output, and text order may be corrupted
|
|
|
|
*/
|
|
|
|
boolean preserveBinTables = false;
|
|
|
|
try
|
|
|
|
{
|
|
|
|
preserveBinTables = Boolean.parseBoolean( System
|
|
|
|
.getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) );
|
|
|
|
}
|
|
|
|
catch ( Exception exc )
|
|
|
|
{
|
|
|
|
// ignore;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( !preserveBinTables )
|
|
|
|
{
|
|
|
|
_cbt.rebuild( cft );
|
|
|
|
_pbt.rebuild( _text, cft );
|
|
|
|
}
|
2010-07-02 16:59:30 -04:00
|
|
|
}
|
2011-07-07 04:35:51 -04:00
|
|
|
|
2017-04-03 22:06:46 -04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Take the first codepage that is not default, ansi or symbol.
|
|
|
|
* Ideally, we'd want to track fonts with runs, but we don't yet
|
|
|
|
* know how to do that.
|
|
|
|
*
|
|
|
|
* Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
|
|
|
|
* appears here.
|
|
|
|
*
|
|
|
|
* @param fontTable
|
|
|
|
* @return
|
|
|
|
*/
|
|
|
|
private Charset guessCodePage(OldFontTable fontTable) {
|
|
|
|
|
|
|
|
for (OldFfn oldFfn : fontTable.getFontNames()) {
|
|
|
|
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
|
|
|
|
if (wmfCharset != null &&
|
|
|
|
wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
|
|
|
|
wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
|
|
|
|
wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
|
|
|
|
return wmfCharset.getCharset();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return DEFAULT_CHARSET;
|
|
|
|
}
|
|
|
|
|
2011-07-07 04:35:51 -04:00
|
|
|
public Range getOverallRange()
|
|
|
|
{
|
2010-07-02 16:59:30 -04:00
|
|
|
// Life is easy when we have no footers, headers or unicode!
|
2011-10-01 11:32:32 -04:00
|
|
|
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
|
2011-07-07 04:35:51 -04:00
|
|
|
}
|
|
|
|
|
2017-04-03 22:06:46 -04:00
|
|
|
/**
|
|
|
|
* Use {@link #getOldFontTable()} instead!!!
|
|
|
|
* This always throws an IllegalArgumentException.
|
|
|
|
*
|
|
|
|
* @return nothing
|
|
|
|
* @throws UnsupportedOperationException
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
@NotImplemented
|
|
|
|
public FontTable getFontTable() {
|
|
|
|
throw new UnsupportedOperationException("Use getOldFontTable instead.");
|
|
|
|
}
|
|
|
|
|
|
|
|
public OldFontTable getOldFontTable() {
|
|
|
|
return fontTable;
|
|
|
|
}
|
2011-07-07 04:35:51 -04:00
|
|
|
public Range getRange()
|
|
|
|
{
|
|
|
|
return getOverallRange();
|
2010-07-02 16:59:30 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
public TextPieceTable getTextTable()
|
|
|
|
{
|
|
|
|
return tpt;
|
2010-06-30 11:13:10 -04:00
|
|
|
}
|
|
|
|
|
2011-07-25 08:58:09 -04:00
|
|
|
@Override
|
|
|
|
public StringBuilder getText()
|
|
|
|
{
|
|
|
|
return _text;
|
|
|
|
}
|
|
|
|
|
2016-07-20 18:35:51 -04:00
|
|
|
@Override
|
|
|
|
public void write() throws IOException {
|
|
|
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
|
|
|
}
|
|
|
|
@Override
|
|
|
|
public void write(File out) throws IOException {
|
|
|
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
|
|
|
}
|
2010-06-30 11:13:10 -04:00
|
|
|
@Override
|
|
|
|
public void write(OutputStream out) throws IOException {
|
|
|
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
|
|
|
}
|
2017-04-03 22:06:46 -04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* As a rough heuristic (total hack), read through the font table
|
|
|
|
* and take the first non-default, non-ansi, non-symbol
|
|
|
|
* font's charset and return that.
|
|
|
|
*
|
|
|
|
* Once we figure out how to link a font to a text piece, we should
|
|
|
|
* use the font information per text piece.
|
|
|
|
*
|
|
|
|
* @return charset
|
|
|
|
*/
|
|
|
|
public Charset getGuessedCharset() {
|
|
|
|
return guessedCharset;
|
|
|
|
}
|
|
|
|
|
2010-06-30 11:13:10 -04:00
|
|
|
}
|