bug 50955 -- word 6.0 charset fix
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c77f1ad3da
commit
af51ea4c10
@ -218,6 +218,9 @@ public class TestAllFiles {
|
|||||||
"document/Word6_sections2.doc",
|
"document/Word6_sections2.doc",
|
||||||
"document/Word95.doc",
|
"document/Word95.doc",
|
||||||
"document/word95err.doc",
|
"document/word95err.doc",
|
||||||
|
"document/Bug60936.doc",
|
||||||
|
"document/Bug60942.doc",
|
||||||
|
"document/Bug60942b.doc",
|
||||||
"hpsf/TestMickey.doc",
|
"hpsf/TestMickey.doc",
|
||||||
"document/52117.doc"
|
"document/52117.doc"
|
||||||
);
|
);
|
||||||
|
@ -18,6 +18,9 @@
|
|||||||
package org.apache.poi.util;
|
package org.apache.poi.util;
|
||||||
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utilities for working with Microsoft CodePages.
|
* Utilities for working with Microsoft CodePages.
|
||||||
@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException;
|
|||||||
*/
|
*/
|
||||||
public class CodePageUtil
|
public class CodePageUtil
|
||||||
{
|
{
|
||||||
|
|
||||||
|
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
|
||||||
|
static {
|
||||||
|
//others?
|
||||||
|
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
|
||||||
|
}
|
||||||
|
|
||||||
/** <p>Codepage 037, a special case</p> */
|
/** <p>Codepage 037, a special case</p> */
|
||||||
public static final int CP_037 = 37;
|
public static final int CP_037 = 37;
|
||||||
|
|
||||||
|
107
src/java/org/apache/poi/util/LittleEndianBig5Stream.java
Normal file
107
src/java/org/apache/poi/util/LittleEndianBig5Stream.java
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.util;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stream that converts MSOffice's way of storing Big5, with
|
||||||
|
* zero-byte padding for ASCII and in LittleEndianOrder.
|
||||||
|
*/
|
||||||
|
@Internal
|
||||||
|
public class LittleEndianBig5Stream extends ByteArrayInputStream {
    private static final int EOF = -1;
    private static final int EMPTY_TRAILING = -3;

    // The byte that is logically trailing in Big5 encoding; in the
    // little-endian layout it is the first one encountered in the buffer.
    // Holds EMPTY_TRAILING whenever no byte is waiting to be handed out.
    int trailing = EMPTY_TRAILING;

    /**
     * Wraps the whole of {@code buf}.
     *
     * @param buf bytes stored as two-byte little-endian units
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * Wraps {@code length} bytes of {@code buf} starting at {@code offset}.
     *
     * @param buf    bytes stored as two-byte little-endian units
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read from {@code buf}
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next byte of the re-ordered stream.
     * <p>
     * The underlying buffer is consumed two bytes at a time. If the second
     * byte of a unit is greater than 0x80 it is returned first and the first
     * byte is returned by the following call; otherwise only the first byte
     * of the unit is returned and the second one is dropped.
     *
     * @return the next byte as a value in 0..255, or -1 at the end of the stream
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    /**
     * Consumes the next two-byte unit from the underlying buffer.
     * Stores the first physical byte in {@link #trailing} only when a
     * second call to {@link #read()} has to hand it out.
     *
     * @return the byte to return now, or -1 once the buffer is exhausted
     */
    private int readNext() {
        int first = super.read();
        if (first == EOF) {
            // trailing is deliberately left empty so that EOF stays sticky
            return EOF;
        }
        int second = super.read();
        if (second == EOF) {
            // Odd number of bytes: hand out the unpaired byte now instead of
            // parking it in 'trailing', where it used to resurface after an
            // EOF had already been reported to the caller.
            return first;
        }
        if (second > 0x80) {
            // double-byte character: leading byte now, trailing byte next
            trailing = first;
            return second;
        }
        // zero-padded single byte, or a pair that is not a valid lead byte:
        // keep the first byte and drop the second
        return first;
    }

    /**
     * Fills {@code buff} by repeatedly calling {@link #read()}.
     *
     * @param buff destination array
     * @param off  index in {@code buff} of the first byte written
     * @param len  maximum number of bytes to write
     * @return the number of bytes written, or -1 if the stream was already
     *         exhausted and nothing could be written
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                return bytesRead == 0 ? EOF : bytesRead;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        return bytesRead;
    }
}
|
@ -17,6 +17,8 @@
|
|||||||
|
|
||||||
package org.apache.poi.util;
|
package org.apache.poi.util;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@ -27,9 +29,14 @@ import java.util.Map;
|
|||||||
*/
|
*/
|
||||||
@Internal
|
@Internal
|
||||||
public class StringUtil {
|
public class StringUtil {
|
||||||
|
|
||||||
|
private static final POILogger logger = POILogFactory
|
||||||
|
.getLogger(StringUtil.class);
|
||||||
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
|
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
|
||||||
protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
public static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
||||||
public static final Charset UTF8 = Charset.forName("UTF-8");
|
public static final Charset UTF8 = Charset.forName("UTF-8");
|
||||||
|
public static final Charset WIN_1252 = Charset.forName("cp1252");
|
||||||
|
public static final Charset BIG5 = Charset.forName("Big5");
|
||||||
|
|
||||||
private static Map<Integer,Integer> msCodepointToUnicode;
|
private static Map<Integer,Integer> msCodepointToUnicode;
|
||||||
|
|
||||||
@ -573,7 +580,28 @@ public class StringUtil {
|
|||||||
9133, // 0xf0fe bracerightbt
|
9133, // 0xf0fe bracerightbt
|
||||||
' ', // 0xf0ff not defined
|
' ', // 0xf0ff not defined
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This tries to convert a LE byte array in Big5 to a String.
|
||||||
|
* We know MS zero-padded ascii, and we drop those.
|
||||||
|
* However, there may be areas for improvement in this.
|
||||||
|
*
|
||||||
|
* @param data
|
||||||
|
* @param offset
|
||||||
|
* @param lengthInBytes
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
|
||||||
|
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||||
|
try {
|
||||||
|
IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os);
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.log(POILogger.WARN,
|
||||||
|
"IOException while copying a byte array stream to a byte array stream?!");
|
||||||
|
}
|
||||||
|
return new String(os.toByteArray(), BIG5);
|
||||||
|
}
|
||||||
|
|
||||||
// Could be replaced with org.apache.commons.lang3.StringUtils#join
|
// Could be replaced with org.apache.commons.lang3.StringUtils#join
|
||||||
@Internal
|
@Internal
|
||||||
public static String join(Object[] array, String separator) {
|
public static String join(Object[] array, String separator) {
|
||||||
|
@ -108,7 +108,7 @@ public class HwmfFont {
|
|||||||
return charset;
|
return charset;
|
||||||
}
|
}
|
||||||
|
|
||||||
static WmfCharset valueOf(int flag) {
|
public static WmfCharset valueOf(int flag) {
|
||||||
for (WmfCharset cs : values()) {
|
for (WmfCharset cs : values()) {
|
||||||
if (cs.flag == flag) return cs;
|
if (cs.flag == flag) return cs;
|
||||||
}
|
}
|
||||||
|
@ -19,27 +19,43 @@ package org.apache.poi.hwpf;
|
|||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
import org.apache.poi.hwmf.record.HwmfFont;
|
||||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||||
|
import org.apache.poi.hwpf.model.FontTable;
|
||||||
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldComplexFileTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldFfn;
|
||||||
|
import org.apache.poi.hwpf.model.OldFontTable;
|
||||||
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
||||||
import org.apache.poi.hwpf.model.OldSectionTable;
|
import org.apache.poi.hwpf.model.OldSectionTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldTextPieceTable;
|
||||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||||
import org.apache.poi.hwpf.model.TextPiece;
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||||
import org.apache.poi.hwpf.usermodel.Range;
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.CodePageUtil;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.NotImplemented;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides very simple support for old (Word 6 / Word 95)
|
* Provides very simple support for old (Word 6 / Word 95)
|
||||||
* files.
|
* files.
|
||||||
*/
|
*/
|
||||||
public class HWPFOldDocument extends HWPFDocumentCore {
|
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
private TextPieceTable tpt;
|
|
||||||
|
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
|
||||||
|
|
||||||
|
private OldTextPieceTable tpt;
|
||||||
|
|
||||||
private StringBuilder _text;
|
private StringBuilder _text;
|
||||||
|
|
||||||
|
private final OldFontTable fontTable;
|
||||||
|
private final Charset guessedCharset;
|
||||||
|
|
||||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||||
this(fs.getRoot());
|
this(fs.getRoot());
|
||||||
@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||||
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
||||||
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
||||||
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
|
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
|
||||||
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
|
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
|
||||||
|
|
||||||
|
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
|
||||||
|
//TODO: figure out how to map runs/text pieces to fonts
|
||||||
|
//for now, if there's a non standard codepage in one of the fonts
|
||||||
|
//assume that the doc is in that codepage.
|
||||||
|
guessedCharset = guessCodePage(fontTable);
|
||||||
|
|
||||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||||
|
|
||||||
// We need to get hold of the text that makes up the
|
// We need to get hold of the text that makes up the
|
||||||
// document, which might be regular or fast-saved
|
// document, which might be regular or fast-saved
|
||||||
ComplexFileTable cft = null;
|
ComplexFileTable cft = null;
|
||||||
StringBuffer text = new StringBuffer();
|
|
||||||
if(_fib.getFibBase().isFComplex()) {
|
if(_fib.getFibBase().isFComplex()) {
|
||||||
cft = new ComplexFileTable(
|
cft = new OldComplexFileTable(
|
||||||
_mainStream, _mainStream,
|
_mainStream, _mainStream,
|
||||||
complexTableOffset, _fib.getFibBase().getFcMin()
|
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
|
||||||
);
|
);
|
||||||
tpt = cft.getTextPieceTable();
|
tpt = (OldTextPieceTable)cft.getTextPieceTable();
|
||||||
|
|
||||||
for(TextPiece tp : tpt.getTextPieces()) {
|
|
||||||
text.append( tp.getStringBuilder() );
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// TODO Discover if these older documents can ever hold Unicode Strings?
|
// TODO Discover if these older documents can ever hold Unicode Strings?
|
||||||
// (We think not, because they seem to lack a Piece table)
|
// (We think not, because they seem to lack a Piece table)
|
||||||
// TODO Build the Piece Descriptor properly
|
// TODO Build the Piece Descriptor properly
|
||||||
// (We have to fake it, as they don't seem to have a proper Piece table)
|
// (We have to fake it, as they don't seem to have a proper Piece table)
|
||||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
|
||||||
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
||||||
|
|
||||||
// Generate a single Text Piece Table, with a single Text Piece
|
// Generate a single Text Piece Table, with a single Text Piece
|
||||||
// which covers all the (8 bit only) text in the file
|
// which covers all the (8 bit only) text in the file
|
||||||
tpt = new TextPieceTable();
|
tpt = new OldTextPieceTable();
|
||||||
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
||||||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
||||||
|
|
||||||
|
int numChars = textData.length;
|
||||||
|
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
||||||
|
numChars /= 2;
|
||||||
|
}
|
||||||
|
|
||||||
TextPiece tp = new TextPiece(
|
TextPiece tp = new TextPiece(
|
||||||
0, textData.length, textData, pd
|
0, numChars, textData, pd
|
||||||
);
|
);
|
||||||
tpt.add(tp);
|
tpt.add(tp);
|
||||||
|
|
||||||
text.append(tp.getStringBuilder());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_text = tpt.getText();
|
_text = tpt.getText();
|
||||||
|
|
||||||
// Now we can fetch the character and paragraph properties
|
// Now we can fetch the character and paragraph properties
|
||||||
@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Take the first codepage that is not default, ansi or symbol.
|
||||||
|
* Ideally, we'd want to track fonts with runs, but we don't yet
|
||||||
|
* know how to do that.
|
||||||
|
*
|
||||||
|
* Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
|
||||||
|
* appears here.
|
||||||
|
*
|
||||||
|
* @param fontTable
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private Charset guessCodePage(OldFontTable fontTable) {
|
||||||
|
|
||||||
|
for (OldFfn oldFfn : fontTable.getFontNames()) {
|
||||||
|
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
|
||||||
|
if (wmfCharset != null &&
|
||||||
|
wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
|
||||||
|
wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
|
||||||
|
wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
|
||||||
|
return wmfCharset.getCharset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return DEFAULT_CHARSET;
|
||||||
|
}
|
||||||
|
|
||||||
public Range getOverallRange()
|
public Range getOverallRange()
|
||||||
{
|
{
|
||||||
// Life is easy when we have no footers, headers or unicode!
|
// Life is easy when we have no footers, headers or unicode!
|
||||||
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
|
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use {@link #getOldFontTable()} instead!!!
|
||||||
|
* This always throws an UnsupportedOperationException.
|
||||||
|
*
|
||||||
|
* @return nothing
|
||||||
|
* @throws UnsupportedOperationException
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
@NotImplemented
|
||||||
|
public FontTable getFontTable() {
|
||||||
|
throw new UnsupportedOperationException("Use getOldFontTable instead.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public OldFontTable getOldFontTable() {
|
||||||
|
return fontTable;
|
||||||
|
}
|
||||||
public Range getRange()
|
public Range getRange()
|
||||||
{
|
{
|
||||||
return getOverallRange();
|
return getOverallRange();
|
||||||
@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||||||
public void write(OutputStream out) throws IOException {
|
public void write(OutputStream out) throws IOException {
|
||||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* As a rough heuristic (total hack), read through the font table
|
||||||
|
* and take the first non-default, non-ansi, non-symbol
|
||||||
|
* font's charset and return that.
|
||||||
|
*
|
||||||
|
* Once we figure out how to link a font to a text piece, we should
|
||||||
|
* use the font information per text piece.
|
||||||
|
*
|
||||||
|
* @return charset
|
||||||
|
*/
|
||||||
|
public Charset getGuessedCharset() {
|
||||||
|
return guessedCharset;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
|||||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
@Internal
|
@Internal
|
||||||
public final class ComplexFileTable {
|
public class ComplexFileTable {
|
||||||
private static final byte GRPPRL_TYPE = 1;
|
private static final byte GRPPRL_TYPE = 1;
|
||||||
private static final byte TEXT_PIECE_TABLE_TYPE = 2;
|
private static final byte TEXT_PIECE_TABLE_TYPE = 2;
|
||||||
|
|
||||||
@ -40,7 +42,8 @@ public final class ComplexFileTable {
|
|||||||
_tpt = new TextPieceTable();
|
_tpt = new TextPieceTable();
|
||||||
}
|
}
|
||||||
|
|
||||||
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
|
protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin,
|
||||||
|
Charset charset) throws IOException {
|
||||||
//skips through the prms before we reach the piece table. These contain data
|
//skips through the prms before we reach the piece table. These contain data
|
||||||
//for actual fast saved files
|
//for actual fast saved files
|
||||||
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
|
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
|
||||||
@ -61,7 +64,12 @@ public final class ComplexFileTable {
|
|||||||
}
|
}
|
||||||
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
|
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
|
||||||
offset += LittleEndian.INT_SIZE;
|
offset += LittleEndian.INT_SIZE;
|
||||||
_tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
|
_tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
|
||||||
|
this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
|
||||||
}
|
}
|
||||||
|
|
||||||
public TextPieceTable getTextPieceTable() {
|
public TextPieceTable getTextPieceTable() {
|
||||||
@ -92,4 +100,11 @@ public final class ComplexFileTable {
|
|||||||
tableStream.write(table);
|
tableStream.write(table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected TextPieceTable newTextPieceTable(byte[] documentStream,
|
||||||
|
byte[] tableStream, int offset, int pieceTableSize, int fcMin,
|
||||||
|
Charset charset) {
|
||||||
|
return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable
|
|||||||
* @param fcMin
|
* @param fcMin
|
||||||
*/
|
*/
|
||||||
public OldCHPBinTable(byte[] documentStream, int offset,
|
public OldCHPBinTable(byte[] documentStream, int offset,
|
||||||
int size, int fcMin, TextPieceTable tpt)
|
int size, int fcMin, OldTextPieceTable tpt)
|
||||||
{
|
{
|
||||||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||||
|
|
||||||
|
@ -0,0 +1,42 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
import org.apache.poi.util.Internal;
|
||||||
|
|
||||||
|
@Internal
|
||||||
|
public final class OldComplexFileTable extends ComplexFileTable {
|
||||||
|
|
||||||
|
public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
|
||||||
|
int offset, int fcMin, Charset charset) throws IOException {
|
||||||
|
super(documentStream, tableStream, offset, fcMin, charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TextPieceTable newTextPieceTable(byte[] documentStream,
|
||||||
|
byte[] tableStream, int offset,
|
||||||
|
int pieceTableSize, int fcMin, Charset charset) {
|
||||||
|
return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
161
src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
Normal file
161
src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
|
import org.apache.poi.hwmf.record.HwmfFont;
|
||||||
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Word 6.0 Font information
|
||||||
|
*/
|
||||||
|
@Internal
|
||||||
|
public final class OldFfn {
|
||||||
|
|
||||||
|
private static final POILogger logger = POILogFactory.getLogger(OldFfn.class);
|
||||||
|
|
||||||
|
private byte _chs;// character set identifier
|
||||||
|
|
||||||
|
private final String fontName;
|
||||||
|
private final String altFontName;
|
||||||
|
|
||||||
|
private final int length; //length in bytes for this record
|
||||||
|
|
||||||
|
/**
|
||||||
|
* try to read an OldFfn starting at offset; read no farther than end
|
||||||
|
*
|
||||||
|
* @param buf buffer from which to read
|
||||||
|
* @param offset offset at which to start
|
||||||
|
* @param fontTableEnd read no farther than this
|
||||||
|
* @return an OldFfn or null if asked to read beyond end
|
||||||
|
*/
|
||||||
|
static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
|
||||||
|
int start = offset;
|
||||||
|
//preliminary bytes
|
||||||
|
if (offset + 6 > fontTableEnd) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
//first byte
|
||||||
|
short fontDescriptionLength = (short) buf[offset];
|
||||||
|
offset += 1;
|
||||||
|
if (offset + fontDescriptionLength > fontTableEnd) {
|
||||||
|
logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
//no idea what these 3 bytes do
|
||||||
|
offset += 3;
|
||||||
|
byte chs = buf[offset];
|
||||||
|
Charset charset = null;
|
||||||
|
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff);
|
||||||
|
if (wmfCharset == null) {
|
||||||
|
logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff));
|
||||||
|
} else {
|
||||||
|
charset = wmfCharset.getCharset();
|
||||||
|
}
|
||||||
|
charset = charset == null ? StringUtil.WIN_1252 : charset;
|
||||||
|
offset += LittleEndian.BYTE_SIZE;
|
||||||
|
//if this byte here == 7, it _may_ signify existence of
|
||||||
|
//an altername font name
|
||||||
|
|
||||||
|
//not sure what the byte after the _chs does
|
||||||
|
offset += LittleEndian.BYTE_SIZE;
|
||||||
|
int fontNameLength = -1;
|
||||||
|
for (int i = offset; i < fontTableEnd; i++) {
|
||||||
|
if (buf[i] == 0) {
|
||||||
|
fontNameLength = i - offset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (fontNameLength == -1) {
|
||||||
|
logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
String fontName = new String(buf, offset, fontNameLength, charset);
|
||||||
|
String altFontName = null;
|
||||||
|
int altFontNameLength = -1;
|
||||||
|
offset += fontNameLength + 1;
|
||||||
|
if (offset - start < fontDescriptionLength) {
|
||||||
|
for (int i = offset; i <= start + fontDescriptionLength; i++) {
|
||||||
|
if (buf[i] == 0) {
|
||||||
|
altFontNameLength = i - offset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (altFontNameLength > -1) {
|
||||||
|
altFontName = new String(buf, offset, altFontNameLength, charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//reset to 0 for length calculation
|
||||||
|
altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte
|
||||||
|
|
||||||
|
int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes
|
||||||
|
fontNameLength + altFontNameLength + 1;//+1 is for the zero byte
|
||||||
|
//this len should == fontDescriptionLength
|
||||||
|
|
||||||
|
return new OldFfn(chs, fontName, altFontName, len);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
|
||||||
|
this._chs = charsetIdentifier;
|
||||||
|
this.fontName = fontName;
|
||||||
|
this.altFontName = altFontName;
|
||||||
|
this.length = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getChs() {
|
||||||
|
return _chs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getMainFontName() {
|
||||||
|
return fontName;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return altFontName if it exists, null otherwise
|
||||||
|
*/
|
||||||
|
public String getAltFontName() {
|
||||||
|
return altFontName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return length in bytes for this record
|
||||||
|
*/
|
||||||
|
public int getLength() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "OldFfn{" +
|
||||||
|
"_chs=" + (_chs & 0xff) +
|
||||||
|
", fontName='" + fontName + '\'' +
|
||||||
|
", altFontName='" + altFontName + '\'' +
|
||||||
|
", length=" + length +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,84 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Font table for Word 6.0
|
||||||
|
*/
|
||||||
|
@Internal
|
||||||
|
public final class OldFontTable {
|
||||||
|
private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class);
|
||||||
|
|
||||||
|
// added extra facilitator members
|
||||||
|
// FFN structure containing strings of font names
|
||||||
|
private final OldFfn[] _fontNames;
|
||||||
|
|
||||||
|
public OldFontTable(byte[] buf, int offset, int length) {
|
||||||
|
//length is stored at the index section in the table
|
||||||
|
//and it is recorded in the first short.
|
||||||
|
|
||||||
|
|
||||||
|
List<OldFfn> ffns = new ArrayList<OldFfn>();
|
||||||
|
int fontTableLength = LittleEndian.getShort(buf, offset);
|
||||||
|
|
||||||
|
int endOfTableOffset = offset + length;
|
||||||
|
int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length!
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
|
||||||
|
if (oldFfn == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
ffns.add(oldFfn);
|
||||||
|
startOffset += oldFfn.getLength();
|
||||||
|
|
||||||
|
}
|
||||||
|
_fontNames = ffns.toArray(new OldFfn[ffns.size()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public OldFfn[] getFontNames() {
|
||||||
|
return _fontNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getMainFont(int chpFtc) {
|
||||||
|
if (chpFtc >= _fontNames.length) {
|
||||||
|
_logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return _fontNames[chpFtc].getMainFontName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "OldFontTable{" +
|
||||||
|
"_fontNames=" + Arrays.toString(_fontNames) +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
120
src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
Normal file
120
src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.poi.util.NotImplemented;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lightweight representation of a text piece.
|
||||||
|
* Works in the character domain, not the byte domain, so you
|
||||||
|
* need to have turned byte references into character
|
||||||
|
* references before getting here.
|
||||||
|
*/
|
||||||
|
@Internal
|
||||||
|
public class OldTextPiece extends TextPiece {
|
||||||
|
|
||||||
|
private final byte[] rawBytes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param start Beginning offset in main document stream, in characters.
|
||||||
|
* @param end Ending offset in main document stream, in characters.
|
||||||
|
* @param text The raw bytes of our text
|
||||||
|
*/
|
||||||
|
public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
|
||||||
|
super(start, end, text, pd);
|
||||||
|
this.rawBytes = text;
|
||||||
|
if (end < start) {
|
||||||
|
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return nothing, ever. Always throws an UnsupportedOperationException
|
||||||
|
* @throws UnsupportedOperationException
|
||||||
|
*/
|
||||||
|
@NotImplemented
|
||||||
|
@Override
|
||||||
|
public boolean isUnicode() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public StringBuilder getStringBuilder() {
|
||||||
|
return (StringBuilder) _buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] getRawBytes() {
|
||||||
|
byte[] buf = new byte[rawBytes.length];
|
||||||
|
System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length);
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns part of the string.
|
||||||
|
* Works only in characters, not in bytes!
|
||||||
|
*
|
||||||
|
* @param start Local start position, in characters
|
||||||
|
* @param end Local end position, in characters
|
||||||
|
* @throws UnsupportedOperationException
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
@NotImplemented
|
||||||
|
public String substring(int start, int end) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Not implemented for OldTextPiece.
|
||||||
|
* Always throws UnsupportedOperationException
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
@NotImplemented
|
||||||
|
public void adjustForDelete(int start, int length) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the length, in bytes
|
||||||
|
*/
|
||||||
|
public int bytesLength() {
|
||||||
|
return rawBytes.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
assert false : "hashCode not designed";
|
||||||
|
return 42; // any arbitrary constant will do
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the character position we start at.
|
||||||
|
*/
|
||||||
|
public int getCP() {
|
||||||
|
return getStart();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "OldTextPiece from " + getStart() + " to " + getEnd() + " ("
|
||||||
|
+ getPieceDescriptor() + ")";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,119 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
|
import org.apache.poi.util.CodePageUtil;
|
||||||
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
|
||||||
|
|
||||||
|
@Internal
|
||||||
|
public class OldTextPieceTable extends TextPieceTable {
|
||||||
|
|
||||||
|
private static final POILogger logger = POILogFactory
|
||||||
|
.getLogger(OldTextPieceTable.class);
|
||||||
|
|
||||||
|
public OldTextPieceTable() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
|
||||||
|
int offset, int size, int fcMin, Charset charset) {
|
||||||
|
//super(documentStream, tableStream, offset, size, fcMin, charset);
|
||||||
|
// get our plex of PieceDescriptors
|
||||||
|
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
|
||||||
|
PieceDescriptor.getSizeInBytes());
|
||||||
|
|
||||||
|
int length = pieceTable.length();
|
||||||
|
PieceDescriptor[] pieces = new PieceDescriptor[length];
|
||||||
|
|
||||||
|
// iterate through piece descriptors raw bytes and create
|
||||||
|
// PieceDescriptor objects
|
||||||
|
for (int x = 0; x < length; x++) {
|
||||||
|
GenericPropertyNode node = pieceTable.getProperty(x);
|
||||||
|
pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Figure out the cp of the earliest text piece
|
||||||
|
// Note that text pieces don't have to be stored in order!
|
||||||
|
_cpMin = pieces[0].getFilePosition() - fcMin;
|
||||||
|
for (PieceDescriptor piece : pieces) {
|
||||||
|
int start = piece.getFilePosition() - fcMin;
|
||||||
|
if (start < _cpMin) {
|
||||||
|
_cpMin = start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// using the PieceDescriptors, build our list of TextPieces.
|
||||||
|
for (int x = 0; x < pieces.length; x++) {
|
||||||
|
int start = pieces[x].getFilePosition();
|
||||||
|
GenericPropertyNode node = pieceTable.getProperty(x);
|
||||||
|
|
||||||
|
// Grab the start and end, which are in characters
|
||||||
|
int nodeStartChars = node.getStart();
|
||||||
|
int nodeEndChars = node.getEnd();
|
||||||
|
|
||||||
|
// What's the relationship between bytes and characters?
|
||||||
|
boolean unicode = pieces[x].isUnicode();
|
||||||
|
int multiple = 1;
|
||||||
|
if (unicode ||
|
||||||
|
(charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
|
||||||
|
multiple = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Figure out the length, in bytes and chars
|
||||||
|
int textSizeChars = (nodeEndChars - nodeStartChars);
|
||||||
|
int textSizeBytes = textSizeChars * multiple;
|
||||||
|
|
||||||
|
// Grab the data that makes up the piece
|
||||||
|
byte[] buf = new byte[textSizeBytes];
|
||||||
|
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
||||||
|
|
||||||
|
// And now build the piece
|
||||||
|
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
|
||||||
|
pieces[x]);
|
||||||
|
|
||||||
|
_textPieces.add(newTextPiece);
|
||||||
|
}
|
||||||
|
|
||||||
|
// In the interest of our sanity, now sort the text pieces
|
||||||
|
// into order, if they're not already
|
||||||
|
Collections.sort(_textPieces);
|
||||||
|
_textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
|
||||||
|
Collections.sort(_textPiecesFCOrder, new FCComparator());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
|
||||||
|
return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int getEncodingMultiplier(TextPiece textPiece) {
|
||||||
|
Charset charset = textPiece.getPieceDescriptor().getCharset();
|
||||||
|
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
@ -260,7 +260,7 @@ public class PAPBinTable
|
|||||||
SprmBuffer sprmBuffer = null;
|
SprmBuffer sprmBuffer = null;
|
||||||
for ( PAPX papx : papxs )
|
for ( PAPX papx : papxs )
|
||||||
{
|
{
|
||||||
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
|
if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if ( sprmBuffer == null ) {
|
if ( sprmBuffer == null ) {
|
||||||
|
@ -17,10 +17,13 @@
|
|||||||
|
|
||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
import org.apache.poi.util.BitField;
|
import org.apache.poi.util.BitField;
|
||||||
import org.apache.poi.util.BitFieldFactory;
|
import org.apache.poi.util.BitFieldFactory;
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
@Internal
|
@Internal
|
||||||
public final class PieceDescriptor
|
public final class PieceDescriptor
|
||||||
@ -32,29 +35,51 @@ public final class PieceDescriptor
|
|||||||
private static BitField fCopied = BitFieldFactory.getInstance(0x04);
|
private static BitField fCopied = BitFieldFactory.getInstance(0x04);
|
||||||
int fc;
|
int fc;
|
||||||
PropertyModifier prm;
|
PropertyModifier prm;
|
||||||
boolean unicode;
|
boolean unicode = false;
|
||||||
|
private final Charset charset;
|
||||||
|
|
||||||
|
|
||||||
public PieceDescriptor(byte[] buf, int offset)
|
public PieceDescriptor(byte[] buf, int offset) {
|
||||||
{
|
this(buf, offset, null);
|
||||||
descriptor = LittleEndian.getShort(buf, offset);
|
|
||||||
offset += LittleEndian.SHORT_SIZE;
|
|
||||||
fc = LittleEndian.getInt(buf, offset);
|
|
||||||
offset += LittleEndian.INT_SIZE;
|
|
||||||
prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
|
|
||||||
|
|
||||||
// see if this piece uses unicode.
|
|
||||||
if ((fc & 0x40000000) == 0)
|
|
||||||
{
|
|
||||||
unicode = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
unicode = false;
|
|
||||||
fc &= ~(0x40000000);//gives me FC in doc stream
|
|
||||||
fc /= 2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* This initializer should only be used for HWPFOldDocuments.
|
||||||
|
*
|
||||||
|
* @param buf
|
||||||
|
* @param offset
|
||||||
|
* @param charset which charset to use if this is not unicode
|
||||||
|
*/
|
||||||
|
public PieceDescriptor(byte[] buf, int offset, Charset charset) {
|
||||||
|
descriptor = LittleEndian.getShort(buf, offset);
|
||||||
|
offset += LittleEndian.SHORT_SIZE;
|
||||||
|
fc = LittleEndian.getInt(buf, offset);
|
||||||
|
offset += LittleEndian.INT_SIZE;
|
||||||
|
prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
|
||||||
|
if (charset == null) {
|
||||||
|
// see if this piece uses unicode.
|
||||||
|
//From the documentation: If the second most significant bit
|
||||||
|
//is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the
|
||||||
|
//second most significant bit is set, then the actual address of the codepage-1252
|
||||||
|
//compressed version of the Unicode character (one byte), is actually at the offset indicated
|
||||||
|
//by clearing this bit and dividing by two.
|
||||||
|
if ((fc & 0x40000000) == 0) {
|
||||||
|
unicode = true;
|
||||||
|
this.charset = null;
|
||||||
|
} else {
|
||||||
|
unicode = false;
|
||||||
|
fc &= ~(0x40000000);//gives me FC in doc stream
|
||||||
|
fc /= 2;
|
||||||
|
this.charset = StringUtil.WIN_1252;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (charset == StringUtil.UTF16LE) {
|
||||||
|
unicode = true;
|
||||||
|
}
|
||||||
|
this.charset = charset;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getFilePosition()
|
public int getFilePosition()
|
||||||
@ -72,6 +97,15 @@ public final class PieceDescriptor
|
|||||||
return unicode;
|
return unicode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return charset to use if this is not a Unicode PieceDescriptor
|
||||||
|
* this can be <code>null</code>
|
||||||
|
*/
|
||||||
|
public Charset getCharset() {
|
||||||
|
return charset;
|
||||||
|
}
|
||||||
|
|
||||||
public PropertyModifier getPrm()
|
public PropertyModifier getPrm()
|
||||||
{
|
{
|
||||||
return prm;
|
return prm;
|
||||||
|
@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model;
|
|||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
import org.apache.poi.util.Internal;
|
import org.apache.poi.util.Internal;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lightweight representation of a text piece.
|
* Lightweight representation of a text piece.
|
||||||
@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||||||
* @param start Beginning offset in main document stream, in characters.
|
* @param start Beginning offset in main document stream, in characters.
|
||||||
* @param end Ending offset in main document stream, in characters.
|
* @param end Ending offset in main document stream, in characters.
|
||||||
* @param text The raw bytes of our text
|
* @param text The raw bytes of our text
|
||||||
* @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
|
|
||||||
* instead
|
* instead
|
||||||
*/
|
*/
|
||||||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
|
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
|
||||||
@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||||||
* Create the StringBuilder from the text and unicode flag
|
* Create the StringBuilder from the text and unicode flag
|
||||||
*/
|
*/
|
||||||
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
|
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
|
||||||
String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252"));
|
byte[] textBuffer = text;
|
||||||
|
if (StringUtil.BIG5.equals(pd.getCharset())) {
|
||||||
|
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
|
||||||
|
return new StringBuilder(txt);
|
||||||
|
}
|
||||||
|
|
||||||
|
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
|
||||||
return new StringBuilder(str);
|
return new StringBuilder(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||||||
return "TextPiece from " + getStart() + " to " + getEnd() + " ("
|
return "TextPiece from " + getStart() + " to " + getEnd() + " ("
|
||||||
+ getPieceDescriptor() + ")";
|
+ getPieceDescriptor() + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||||||
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
||||||
|
|
||||||
// And now build the piece
|
// And now build the piece
|
||||||
final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf,
|
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
|
||||||
pieces[x]);
|
pieces[x]);
|
||||||
|
|
||||||
_textPieces.add(newTextPiece);
|
_textPieces.add(newTextPiece);
|
||||||
@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||||||
Collections.sort(_textPiecesFCOrder, new FCComparator());
|
Collections.sort(_textPiecesFCOrder, new FCComparator());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
|
||||||
|
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
|
||||||
|
}
|
||||||
|
|
||||||
public void add(TextPiece piece) {
|
public void add(TextPiece piece) {
|
||||||
_textPieces.add(piece);
|
_textPieces.add(piece);
|
||||||
_textPiecesFCOrder.add(piece);
|
_textPiecesFCOrder.add(piece);
|
||||||
@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||||||
if (rangeStartBytes > rangeEndBytes)
|
if (rangeStartBytes > rangeEndBytes)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
|
final int encodingMultiplier = getEncodingMultiplier(textPiece);
|
||||||
|
|
||||||
final int rangeStartCp = textPiece.getStart()
|
final int rangeStartCp = textPiece.getStart()
|
||||||
+ (rangeStartBytes - tpStart) / encodingMultiplier;
|
+ (rangeStartBytes - tpStart) / encodingMultiplier;
|
||||||
@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||||||
return result.toArray(new int[result.size()][]);
|
return result.toArray(new int[result.size()][]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int getEncodingMultiplier(TextPiece textPiece) {
|
||||||
|
return textPiece.isUnicode() ? 2 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
public int getCpMin() {
|
public int getCpMin() {
|
||||||
return _cpMin;
|
return _cpMin;
|
||||||
}
|
}
|
||||||
@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||||||
return textPlex.toByteArray();
|
return textPlex.toByteArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class FCComparator implements Comparator<TextPiece>, Serializable {
|
protected static class FCComparator implements Comparator<TextPiece>, Serializable {
|
||||||
public int compare(TextPiece textPiece, TextPiece textPiece1) {
|
public int compare(TextPiece textPiece, TextPiece textPiece1) {
|
||||||
if (textPiece.getPieceDescriptor().fc > textPiece1
|
if (textPiece.getPieceDescriptor().fc > textPiece1
|
||||||
.getPieceDescriptor().fc) {
|
.getPieceDescriptor().fc) {
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package org.apache.poi.hwpf.usermodel;
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
import org.apache.poi.hwpf.model.CHPX;
|
import org.apache.poi.hwpf.model.CHPX;
|
||||||
import org.apache.poi.hwpf.model.FFData;
|
import org.apache.poi.hwpf.model.FFData;
|
||||||
import org.apache.poi.hwpf.model.Ffn;
|
import org.apache.poi.hwpf.model.Ffn;
|
||||||
@ -438,6 +439,10 @@ public final class CharacterRun extends Range
|
|||||||
|
|
||||||
public String getFontName()
|
public String getFontName()
|
||||||
{
|
{
|
||||||
|
if (_doc instanceof HWPFOldDocument) {
|
||||||
|
return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii());
|
||||||
|
}
|
||||||
|
|
||||||
if (_doc.getFontTable() == null)
|
if (_doc.getFontTable() == null)
|
||||||
// old word format
|
// old word format
|
||||||
return null;
|
return null;
|
||||||
|
@ -16,18 +16,19 @@
|
|||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.hwpf.converter;
|
package org.apache.poi.hwpf.converter;
|
||||||
|
|
||||||
import java.io.File;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import java.io.FilenameFilter;
|
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.transform.OutputKeys;
|
||||||
import javax.xml.transform.Transformer;
|
import javax.xml.transform.Transformer;
|
||||||
import javax.xml.transform.TransformerFactory;
|
import javax.xml.transform.TransformerFactory;
|
||||||
import javax.xml.transform.dom.DOMSource;
|
import javax.xml.transform.dom.DOMSource;
|
||||||
import javax.xml.transform.stream.StreamResult;
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FilenameFilter;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hwpf.HWPFDocumentCore;
|
import org.apache.poi.hwpf.HWPFDocumentCore;
|
||||||
@ -36,8 +37,6 @@ import org.junit.Test;
|
|||||||
import org.junit.runner.RunWith;
|
import org.junit.runner.RunWith;
|
||||||
import org.junit.runners.Parameterized;
|
import org.junit.runners.Parameterized;
|
||||||
|
|
||||||
import static org.junit.Assert.assertNotNull;
|
|
||||||
|
|
||||||
@RunWith(Parameterized.class)
|
@RunWith(Parameterized.class)
|
||||||
public class TestWordToConverterSuite
|
public class TestWordToConverterSuite
|
||||||
{
|
{
|
||||||
@ -45,7 +44,11 @@ public class TestWordToConverterSuite
|
|||||||
* YK: a quick hack to exclude failing documents from the suite.
|
* YK: a quick hack to exclude failing documents from the suite.
|
||||||
*/
|
*/
|
||||||
private static List<String> failingFiles = Arrays
|
private static List<String> failingFiles = Arrays
|
||||||
.asList( "ProblemExtracting.doc" );
|
.asList( "ProblemExtracting.doc",
|
||||||
|
"Bug50955.doc" //basic extraction works,
|
||||||
|
// but these extractors modify the document,
|
||||||
|
// which is a no-go for this Word 6.0 file
|
||||||
|
);
|
||||||
|
|
||||||
@Parameterized.Parameters(name="{index}: {0}")
|
@Parameterized.Parameters(name="{index}: {0}")
|
||||||
public static Iterable<Object[]> files() {
|
public static Iterable<Object[]> files() {
|
||||||
|
@ -57,6 +57,7 @@ import junit.framework.TestCase;
|
|||||||
* against HWPF
|
* against HWPF
|
||||||
*/
|
*/
|
||||||
public class TestBugs{
|
public class TestBugs{
|
||||||
|
|
||||||
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
|
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
|
||||||
|
|
||||||
public static void assertEqualsIgnoreNewline(String expected, String actual )
|
public static void assertEqualsIgnoreNewline(String expected, String actual )
|
||||||
@ -536,13 +537,6 @@ public class TestBugs{
|
|||||||
hwpfDocument.getPicturesTable().getAllPictures();
|
hwpfDocument.getPicturesTable().getAllPictures();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* [FAILING] Bug 50955 - error while retrieving the text file
|
|
||||||
*/
|
|
||||||
@Test(expected=IllegalStateException.class)
|
|
||||||
public void test50955() throws IOException {
|
|
||||||
getTextOldFile("Bug50955.doc");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
|
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
|
||||||
|
@ -17,14 +17,19 @@
|
|||||||
|
|
||||||
package org.apache.poi.hwpf.usermodel;
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import static org.apache.poi.POITestCase.assertContains;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
|
||||||
import org.apache.poi.OldFileFormatException;
|
import org.apache.poi.OldFileFormatException;
|
||||||
|
import org.apache.poi.hwmf.record.HwmfFont;
|
||||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
import org.apache.poi.hwpf.HWPFTestCase;
|
import org.apache.poi.hwpf.HWPFTestCase;
|
||||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||||
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||||
|
import org.apache.poi.hwpf.model.OldFontTable;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
|
|||||||
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
|
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
|
||||||
// Normal, superscript for 4th, normal
|
// Normal, superscript for 4th, normal
|
||||||
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
|
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
|
||||||
|
|
||||||
doc.close();
|
doc.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
|
|||||||
doc.getRange().getParagraph(1).text());
|
doc.getRange().getParagraph(1).text());
|
||||||
doc.close();
|
doc.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDefaultCodePageEncoding() throws IOException {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
|
||||||
|
Word6Extractor ex = new Word6Extractor(doc);
|
||||||
|
String txt = ex.getText();
|
||||||
|
assertContains(txt, "BERTHOD");
|
||||||
|
assertContains(txt, "APPLICOLOR");
|
||||||
|
assertContains(txt, "les meilleurs");
|
||||||
|
assertContains(txt, "GUY LECOLE");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCodePageBug50955() throws IOException {
|
||||||
|
//windows 1251
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
|
||||||
|
Word6Extractor ex = new Word6Extractor(doc);
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (String p : ex.getParagraphText()) {
|
||||||
|
sb.append(p);
|
||||||
|
}
|
||||||
|
assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCodePageBug60936() throws IOException {
|
||||||
|
//windows 1250 -- this test file was generated with OpenOffice
|
||||||
|
//see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
|
||||||
|
|
||||||
|
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
|
||||||
|
Word6Extractor ex = new Word6Extractor(doc);
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (String p : ex.getParagraphText()) {
|
||||||
|
sb.append(p);
|
||||||
|
}
|
||||||
|
assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings!
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOldFontTableEncoding() throws IOException {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
|
||||||
|
OldFontTable oldFontTable = doc.getOldFontTable();
|
||||||
|
assertEquals(5, oldFontTable.getFontNames().length);
|
||||||
|
assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
|
||||||
|
assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5"));
|
||||||
|
assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
|
||||||
|
doc.close();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOldFontTableAltName() throws IOException {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
|
||||||
|
OldFontTable oldFontTable = doc.getOldFontTable();
|
||||||
|
assertEquals(5, oldFontTable.getFontNames().length);
|
||||||
|
assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
|
||||||
|
assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
|
||||||
|
assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
|
||||||
|
assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test51944() throws IOException {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
|
||||||
|
Word6Extractor ex = new Word6Extractor(doc);
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (String p : ex.getParagraphText()) {
|
||||||
|
sb.append(p.replaceAll("[\r\n]+", "\n"));
|
||||||
|
}
|
||||||
|
String txt = sb.toString();
|
||||||
|
assertContains(txt, "Post and Fax");
|
||||||
|
assertContains(txt, "also maintain");//this is at a critical juncture
|
||||||
|
assertContains(txt, "which are available for");//this too
|
||||||
|
|
||||||
|
//TODO: figure out why these two aren't passing
|
||||||
|
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
|
||||||
|
// assertContains(txt, "We are able to");//not sure if we can get this easily?
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
BIN
test-data/document/Bug60936.doc
Normal file
BIN
test-data/document/Bug60936.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug60942.doc
Normal file
BIN
test-data/document/Bug60942.doc
Normal file
Binary file not shown.
BIN
test-data/document/Bug60942b.doc
Normal file
BIN
test-data/document/Bug60942b.doc
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user