Bug 60608 -- improve charset handling in Hwmf
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1779519 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f6388c2fff
commit
a9436e9789
@ -29,6 +29,7 @@ import java.awt.font.TextAttribute;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.AttributedString;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
@ -48,8 +49,11 @@ import org.apache.poi.hwmf.record.HwmfPenStyle.HwmfLineDash;
|
||||
import org.apache.poi.sl.draw.DrawFactory;
|
||||
import org.apache.poi.sl.draw.DrawFontManager;
|
||||
import org.apache.poi.sl.draw.Drawable;
|
||||
import org.apache.poi.util.LocaleUtil;
|
||||
|
||||
public class HwmfGraphics {
|
||||
|
||||
private static final Charset DEFAULT_CHARSET = LocaleUtil.CHARSET_1252;
|
||||
private final Graphics2D graphicsCtx;
|
||||
private final List<HwmfDrawProperties> propStack = new LinkedList<HwmfDrawProperties>();
|
||||
private HwmfDrawProperties prop = new HwmfDrawProperties();
|
||||
@ -311,14 +315,34 @@ public class HwmfGraphics {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param text
|
||||
* @param bounds
|
||||
* @deprecated use {@link #drawString(byte[], Rectangle2D)}
|
||||
*/
|
||||
public void drawString(String text, Rectangle2D bounds) {
|
||||
drawString(text, bounds, null);
|
||||
}
|
||||
|
||||
|
||||
public void drawString(byte[] text, Rectangle2D bounds) {
|
||||
drawString(text, bounds, null);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param text
|
||||
* @param bounds
|
||||
* @deprecated use {@link #drawString(byte[], Rectangle2D, int[])}
|
||||
*/
|
||||
public void drawString(String text, Rectangle2D bounds, int dx[]) {
|
||||
drawString(text.getBytes(DEFAULT_CHARSET), bounds, dx);
|
||||
}
|
||||
|
||||
public void drawString(byte[] text, Rectangle2D bounds, int dx[]) {
|
||||
HwmfFont font = prop.getFont();
|
||||
if (font == null || text == null || text.isEmpty()) {
|
||||
if (font == null || text == null || text.length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -326,8 +350,11 @@ public class HwmfGraphics {
|
||||
// TODO: another approx. ...
|
||||
double fontW = fontH/1.8;
|
||||
|
||||
int len = text.length();
|
||||
AttributedString as = new AttributedString(text);
|
||||
int len = text.length;
|
||||
Charset charset = (font.getCharSet().getCharset() == null)?
|
||||
DEFAULT_CHARSET : font.getCharSet().getCharset();
|
||||
String textString = new String(text, charset);
|
||||
AttributedString as = new AttributedString(textString);
|
||||
if (dx == null || dx.length == 0) {
|
||||
addAttributes(as, font);
|
||||
} else {
|
||||
|
@ -19,67 +19,93 @@ package org.apache.poi.hwmf.record;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
|
||||
import org.apache.poi.util.LittleEndianConsts;
|
||||
import org.apache.poi.util.LittleEndianInputStream;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* The Font object specifies the attributes of a logical font
|
||||
*/
|
||||
public class HwmfFont {
|
||||
|
||||
private static final POILogger logger = POILogFactory.getLogger(HwmfFont.class);
|
||||
|
||||
public enum WmfCharset {
|
||||
/** Specifies the English character set. */
|
||||
ANSI_CHARSET(0x00000000),
|
||||
ANSI_CHARSET(0x00000000, "Cp1252"),
|
||||
/**
|
||||
* Specifies a character set based on the current system locale;
|
||||
* for example, when the system locale is United States English,
|
||||
* the default character set is ANSI_CHARSET.
|
||||
*/
|
||||
DEFAULT_CHARSET(0x00000001),
|
||||
DEFAULT_CHARSET(0x00000001, "Cp1252"),
|
||||
/** Specifies a character set of symbols. */
|
||||
SYMBOL_CHARSET(0x00000002),
|
||||
SYMBOL_CHARSET(0x00000002, ""),
|
||||
/** Specifies the Apple Macintosh character set. */
|
||||
MAC_CHARSET(0x0000004D),
|
||||
MAC_CHARSET(0x0000004D, "MacRoman"),
|
||||
/** Specifies the Japanese character set. */
|
||||
SHIFTJIS_CHARSET(0x00000080),
|
||||
SHIFTJIS_CHARSET(0x00000080, "Shift_JIS"),
|
||||
/** Also spelled "Hangeul". Specifies the Hangul Korean character set. */
|
||||
HANGUL_CHARSET(0x00000081),
|
||||
HANGUL_CHARSET(0x00000081, "cp949"),
|
||||
/** Also spelled "Johap". Specifies the Johab Korean character set. */
|
||||
JOHAB_CHARSET(0x00000082),
|
||||
JOHAB_CHARSET(0x00000082, "x-Johab"),
|
||||
/** Specifies the "simplified" Chinese character set for People's Republic of China. */
|
||||
GB2312_CHARSET(0x00000086),
|
||||
GB2312_CHARSET(0x00000086, "GB2312"),
|
||||
/**
|
||||
* Specifies the "traditional" Chinese character set, used mostly in
|
||||
* Taiwan and in the Hong Kong and Macao Special Administrative Regions.
|
||||
*/
|
||||
CHINESEBIG5_CHARSET(0x00000088),
|
||||
CHINESEBIG5_CHARSET(0x00000088, "Big5"),
|
||||
/** Specifies the Greek character set. */
|
||||
GREEK_CHARSET(0x000000A1),
|
||||
GREEK_CHARSET(0x000000A1, "Cp1253"),
|
||||
/** Specifies the Turkish character set. */
|
||||
TURKISH_CHARSET(0x000000A2),
|
||||
TURKISH_CHARSET(0x000000A2, "Cp1254"),
|
||||
/** Specifies the Vietnamese character set. */
|
||||
VIETNAMESE_CHARSET(0x000000A3),
|
||||
VIETNAMESE_CHARSET(0x000000A3, "Cp1258"),
|
||||
/** Specifies the Hebrew character set. */
|
||||
HEBREW_CHARSET(0x000000B1),
|
||||
HEBREW_CHARSET(0x000000B1, "Cp1255"),
|
||||
/** Specifies the Arabic character set. */
|
||||
ARABIC_CHARSET(0x000000B2),
|
||||
ARABIC_CHARSET(0x000000B2, "Cp1256"),
|
||||
/** Specifies the Baltic (Northeastern European) character set. */
|
||||
BALTIC_CHARSET(0x000000BA),
|
||||
BALTIC_CHARSET(0x000000BA, "Cp1257"),
|
||||
/** Specifies the Russian Cyrillic character set. */
|
||||
RUSSIAN_CHARSET(0x000000CC),
|
||||
RUSSIAN_CHARSET(0x000000CC, "Cp1251"),
|
||||
/** Specifies the Thai character set. */
|
||||
THAI_CHARSET(0x000000DE),
|
||||
THAI_CHARSET(0x000000DE, "x-windows-874"),
|
||||
/** Specifies a Eastern European character set. */
|
||||
EASTEUROPE_CHARSET(0x000000EE),
|
||||
EASTEUROPE_CHARSET(0x000000EE, "Cp1250"),
|
||||
/**
|
||||
* Specifies a mapping to one of the OEM code pages,
|
||||
* according to the current system locale setting.
|
||||
*/
|
||||
OEM_CHARSET(0x000000FF);
|
||||
OEM_CHARSET(0x000000FF, "Cp1252");
|
||||
|
||||
int flag;
|
||||
WmfCharset(int flag) {
|
||||
Charset charset;
|
||||
|
||||
WmfCharset(int flag, String javaCharsetName) {
|
||||
this.flag = flag;
|
||||
if (javaCharsetName.length() > 0) {
|
||||
try {
|
||||
charset = Charset.forName(javaCharsetName);
|
||||
return;
|
||||
} catch (UnsupportedCharsetException e) {
|
||||
logger.log(POILogger.WARN, "Unsupported charset: "+javaCharsetName);
|
||||
}
|
||||
}
|
||||
charset = null;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return charset for the font or <code>null</code> if there is no matching charset or
|
||||
* if the charset is a "default"
|
||||
*/
|
||||
public Charset getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
static WmfCharset valueOf(int flag) {
|
||||
|
@ -19,6 +19,7 @@ package org.apache.poi.hwmf.record;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.hwmf.draw.HwmfDrawProperties;
|
||||
import org.apache.poi.hwmf.draw.HwmfGraphics;
|
||||
@ -27,7 +28,6 @@ import org.apache.poi.util.BitField;
|
||||
import org.apache.poi.util.BitFieldFactory;
|
||||
import org.apache.poi.util.LittleEndianConsts;
|
||||
import org.apache.poi.util.LittleEndianInputStream;
|
||||
import org.apache.poi.util.LocaleUtil;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
@ -144,7 +144,7 @@ public class HwmfText {
|
||||
* length of the string.
|
||||
* The string is written at the location specified by the XStart and YStart fields.
|
||||
*/
|
||||
private String text;
|
||||
private byte[] rawTextBytes;
|
||||
/**
|
||||
* A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical
|
||||
* units, of the point where drawing is to start.
|
||||
@ -164,18 +164,33 @@ public class HwmfText {
|
||||
/**
 * Reads the record fields from the stream: the string length, the text
 * bytes (the buffer is sized up to an even number of bytes) and the
 * y/x start coordinates.
 *
 * @return the size of three shorts plus the length of the text buffer
 */
// NOTE(review): this view shows both sides of the diff - the buf/text
// statements look like the removed lines and the rawTextBytes statements
// like their replacements; confirm against the committed file.
@Override
|
||||
public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException {
|
||||
stringLength = leis.readShort();
|
||||
byte buf[] = new byte[stringLength+(stringLength&1)];
|
||||
leis.readFully(buf);
|
||||
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252).trim();
|
||||
// an odd stringLength gets one extra byte, so the buffer length is always even
rawTextBytes = new byte[stringLength+(stringLength&1)];
|
||||
leis.readFully(rawTextBytes);
|
||||
yStart = leis.readShort();
|
||||
xStart = leis.readShort();
|
||||
return 3*LittleEndianConsts.SHORT_SIZE+buf.length;
|
||||
return 3*LittleEndianConsts.SHORT_SIZE+rawTextBytes.length;
|
||||
}
|
||||
|
||||
/**
 * Draws the text of this record through the graphics context, using a
 * zero-sized rectangle located at (xStart, yStart) as bounds.
 */
// NOTE(review): this view shows both sides of the diff - the call passing
// "text" looks like the removed line, the getTextBytes() call like its
// replacement; confirm against the committed file.
@Override
|
||||
public void draw(HwmfGraphics ctx) {
|
||||
Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0);
|
||||
ctx.drawString(text, bounds);
|
||||
ctx.drawString(getTextBytes(), bounds);
|
||||
}
|
||||
|
||||
public String getText(Charset charset) {
|
||||
return new String(getTextBytes(), charset);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return a copy of a trimmed byte array of rawTextBytes bytes.
|
||||
* This includes only the bytes from 0..stringLength.
|
||||
* This does not include the extra optional padding on the byte array.
|
||||
*/
|
||||
private byte[] getTextBytes() {
|
||||
byte[] ret = new byte[stringLength];
|
||||
System.arraycopy(rawTextBytes, 0, ret, 0, stringLength);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
@ -264,7 +279,7 @@ public class HwmfText {
|
||||
* the length is odd, an extra byte is placed after it so that the following member (optional Dx) is
|
||||
* aligned on a 16-bit boundary.
|
||||
*/
|
||||
private String text;
|
||||
private byte[] rawTextBytes;
|
||||
/**
|
||||
* An optional array of 16-bit signed integers that indicate the distance between
|
||||
* origins of adjacent character cells. For example, Dx[i] logical units separate the origins of
|
||||
@ -300,10 +315,9 @@ public class HwmfText {
|
||||
size += 4*LittleEndianConsts.SHORT_SIZE;
|
||||
}
|
||||
|
||||
byte buf[] = new byte[stringLength+(stringLength&1)];
|
||||
leis.readFully(buf);
|
||||
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252);
|
||||
size += buf.length;
|
||||
rawTextBytes = new byte[stringLength+(stringLength&1)];
|
||||
leis.readFully(rawTextBytes);
|
||||
size += rawTextBytes.length;
|
||||
|
||||
if (size >= remainingRecordSize) {
|
||||
logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info");
|
||||
@ -327,7 +341,23 @@ public class HwmfText {
|
||||
/**
 * Draws the text of this record through the graphics context, using a
 * zero-sized rectangle located at (x, y) as bounds and handing over the
 * dx array.
 */
// NOTE(review): this view shows both sides of the diff - the call passing
// "text" looks like the removed line, the getTextBytes() call like its
// replacement; confirm against the committed file.
@Override
|
||||
public void draw(HwmfGraphics ctx) {
|
||||
Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0);
|
||||
ctx.drawString(text, bounds, dx);
|
||||
ctx.drawString(getTextBytes(), bounds, dx);
|
||||
}
|
||||
|
||||
public String getText(Charset charset) {
|
||||
return new String(getTextBytes(), charset);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return a copy of a trimmed byte array of rawTextBytes bytes.
|
||||
* This includes only the bytes from 0..stringLength.
|
||||
* This does not include the extra optional padding on the byte array.
|
||||
*/
|
||||
private byte[] getTextBytes() {
|
||||
byte[] ret = new byte[stringLength];
|
||||
System.arraycopy(rawTextBytes, 0, ret, 0, stringLength);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
@ -523,5 +553,9 @@ public class HwmfText {
|
||||
public void applyObject(HwmfGraphics ctx) {
|
||||
ctx.getProperties().setFont(font);
|
||||
}
|
||||
|
||||
/**
 * @return the font of this record, i.e. the same object which
 *  applyObject sets on the properties of the graphics context
 */
public HwmfFont getFont() {
|
||||
return font;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,9 @@
|
||||
package org.apache.poi.hwmf;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.Dimension;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.RenderingHints;
|
||||
@ -31,21 +33,24 @@ import java.io.FileOutputStream;
|
||||
import java.io.FilterInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord;
|
||||
import org.apache.poi.hwmf.record.HwmfFont;
|
||||
import org.apache.poi.hwmf.record.HwmfRecord;
|
||||
import org.apache.poi.hwmf.record.HwmfRecordType;
|
||||
import org.apache.poi.hwmf.record.HwmfText;
|
||||
import org.apache.poi.hwmf.usermodel.HwmfPicture;
|
||||
import org.apache.poi.sl.usermodel.PictureData;
|
||||
import org.apache.poi.sl.usermodel.PictureData.PictureType;
|
||||
import org.apache.poi.sl.usermodel.SlideShow;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.LocaleUtil;
|
||||
import org.apache.poi.util.Units;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
@ -188,4 +193,33 @@ public class TestHwmfParsing {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore("If we decide we can use common crawl file specified, we can turn this back on")
|
||||
public void testCyrillic() throws Exception {
|
||||
//TODO: move test file to framework and fix this
|
||||
File dir = new File("C:/somethingOrOther");
|
||||
File f = new File(dir, "ZMLH54SPLI76NQ7XMKVB7SMUJA2HTXTS-2.wmf");
|
||||
HwmfPicture wmf = new HwmfPicture(new FileInputStream(f));
|
||||
|
||||
Charset charset = LocaleUtil.CHARSET_1252;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
//this is pure hackery for specifying the font
|
||||
//this happens to work on this test file, but you need to
|
||||
//do what Graphics does by maintaining the stack, etc.!
|
||||
for (HwmfRecord r : wmf.getRecords()) {
|
||||
if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
|
||||
HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont();
|
||||
charset = (font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset();
|
||||
}
|
||||
if (r.getRecordType().equals(HwmfRecordType.extTextOut)) {
|
||||
HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r;
|
||||
sb.append(textOut.getText(charset)).append("\n");
|
||||
}
|
||||
}
|
||||
String txt = sb.toString();
|
||||
assertTrue(txt.contains("\u041E\u0431\u0449\u043E"));
|
||||
assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441"));
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user