Bug 60608 -- improve charset handling in Hwmf

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1779519 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-01-19 20:19:26 +00:00
parent f6388c2fff
commit a9436e9789
4 changed files with 161 additions and 40 deletions

View File

@ -29,6 +29,7 @@ import java.awt.font.TextAttribute;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage; import java.awt.image.BufferedImage;
import java.nio.charset.Charset;
import java.text.AttributedString; import java.text.AttributedString;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.LinkedList; import java.util.LinkedList;
@ -48,8 +49,11 @@ import org.apache.poi.hwmf.record.HwmfPenStyle.HwmfLineDash;
import org.apache.poi.sl.draw.DrawFactory; import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawFontManager; import org.apache.poi.sl.draw.DrawFontManager;
import org.apache.poi.sl.draw.Drawable; import org.apache.poi.sl.draw.Drawable;
import org.apache.poi.util.LocaleUtil;
public class HwmfGraphics { public class HwmfGraphics {
private static final Charset DEFAULT_CHARSET = LocaleUtil.CHARSET_1252;
private final Graphics2D graphicsCtx; private final Graphics2D graphicsCtx;
private final List<HwmfDrawProperties> propStack = new LinkedList<HwmfDrawProperties>(); private final List<HwmfDrawProperties> propStack = new LinkedList<HwmfDrawProperties>();
private HwmfDrawProperties prop = new HwmfDrawProperties(); private HwmfDrawProperties prop = new HwmfDrawProperties();
@ -312,13 +316,33 @@ public class HwmfGraphics {
} }
} }
/**
*
* @param text
* @param bounds
* @deprecated use {@link #drawString(byte[], Rectangle2D)}
*/
public void drawString(String text, Rectangle2D bounds) { public void drawString(String text, Rectangle2D bounds) {
drawString(text, bounds, null); drawString(text, bounds, null);
} }
/**
 * Draws the given raw text bytes at the given position without
 * any per-character spacing information.
 *
 * @param text the undecoded text bytes as stored in the record
 * @param bounds the rectangle anchoring the text
 */
public void drawString(byte[] text, Rectangle2D bounds) {
    // no dx array: delegate with "no character spacing"
    final int[] noSpacing = null;
    drawString(text, bounds, noSpacing);
}
/**
*
* @param text
* @param bounds
* @deprecated use {@link #drawString(byte[], Rectangle2D, int[])}
*/
public void drawString(String text, Rectangle2D bounds, int dx[]) { public void drawString(String text, Rectangle2D bounds, int dx[]) {
drawString(text.getBytes(DEFAULT_CHARSET), bounds, dx);
}
public void drawString(byte[] text, Rectangle2D bounds, int dx[]) {
HwmfFont font = prop.getFont(); HwmfFont font = prop.getFont();
if (font == null || text == null || text.isEmpty()) { if (font == null || text == null || text.length == 0) {
return; return;
} }
@ -326,8 +350,11 @@ public class HwmfGraphics {
// TODO: another approx. ... // TODO: another approx. ...
double fontW = fontH/1.8; double fontW = fontH/1.8;
int len = text.length(); int len = text.length;
AttributedString as = new AttributedString(text); Charset charset = (font.getCharSet().getCharset() == null)?
DEFAULT_CHARSET : font.getCharSet().getCharset();
String textString = new String(text, charset);
AttributedString as = new AttributedString(textString);
if (dx == null || dx.length == 0) { if (dx == null || dx.length == 0) {
addAttributes(as, font); addAttributes(as, font);
} else { } else {

View File

@ -19,67 +19,93 @@ package org.apache.poi.hwmf.record;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.poi.util.LittleEndianConsts; import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream; import org.apache.poi.util.LittleEndianInputStream;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* The Font object specifies the attributes of a logical font * The Font object specifies the attributes of a logical font
*/ */
public class HwmfFont { public class HwmfFont {
private static final POILogger logger = POILogFactory.getLogger(HwmfFont.class);
public enum WmfCharset { public enum WmfCharset {
/** Specifies the English character set. */ /** Specifies the English character set. */
ANSI_CHARSET(0x00000000), ANSI_CHARSET(0x00000000, "Cp1252"),
/** /**
* Specifies a character set based on the current system locale; * Specifies a character set based on the current system locale;
* for example, when the system locale is United States English, * for example, when the system locale is United States English,
* the default character set is ANSI_CHARSET. * the default character set is ANSI_CHARSET.
*/ */
DEFAULT_CHARSET(0x00000001), DEFAULT_CHARSET(0x00000001, "Cp1252"),
/** Specifies a character set of symbols. */ /** Specifies a character set of symbols. */
SYMBOL_CHARSET(0x00000002), SYMBOL_CHARSET(0x00000002, ""),
/** Specifies the Apple Macintosh character set. */ /** Specifies the Apple Macintosh character set. */
MAC_CHARSET(0x0000004D), MAC_CHARSET(0x0000004D, "MacRoman"),
/** Specifies the Japanese character set. */ /** Specifies the Japanese character set. */
SHIFTJIS_CHARSET(0x00000080), SHIFTJIS_CHARSET(0x00000080, "Shift_JIS"),
/** Also spelled "Hangeul". Specifies the Hangul Korean character set. */ /** Also spelled "Hangeul". Specifies the Hangul Korean character set. */
HANGUL_CHARSET(0x00000081), HANGUL_CHARSET(0x00000081, "cp949"),
/** Also spelled "Johap". Specifies the Johab Korean character set. */ /** Also spelled "Johap". Specifies the Johab Korean character set. */
JOHAB_CHARSET(0x00000082), JOHAB_CHARSET(0x00000082, "x-Johab"),
/** Specifies the "simplified" Chinese character set for People's Republic of China. */ /** Specifies the "simplified" Chinese character set for People's Republic of China. */
GB2312_CHARSET(0x00000086), GB2312_CHARSET(0x00000086, "GB2312"),
/** /**
* Specifies the "traditional" Chinese character set, used mostly in * Specifies the "traditional" Chinese character set, used mostly in
* Taiwan and in the Hong Kong and Macao Special Administrative Regions. * Taiwan and in the Hong Kong and Macao Special Administrative Regions.
*/ */
CHINESEBIG5_CHARSET(0x00000088), CHINESEBIG5_CHARSET(0x00000088, "Big5"),
/** Specifies the Greek character set. */ /** Specifies the Greek character set. */
GREEK_CHARSET(0x000000A1), GREEK_CHARSET(0x000000A1, "Cp1253"),
/** Specifies the Turkish character set. */ /** Specifies the Turkish character set. */
TURKISH_CHARSET(0x000000A2), TURKISH_CHARSET(0x000000A2, "Cp1254"),
/** Specifies the Vietnamese character set. */ /** Specifies the Vietnamese character set. */
VIETNAMESE_CHARSET(0x000000A3), VIETNAMESE_CHARSET(0x000000A3, "Cp1258"),
/** Specifies the Hebrew character set. */ /** Specifies the Hebrew character set. */
HEBREW_CHARSET(0x000000B1), HEBREW_CHARSET(0x000000B1, "Cp1255"),
/** Specifies the Arabic character set. */ /** Specifies the Arabic character set. */
ARABIC_CHARSET(0x000000B2), ARABIC_CHARSET(0x000000B2, "Cp1256"),
/** Specifies the Baltic (Northeastern European) character set. */ /** Specifies the Baltic (Northeastern European) character set. */
BALTIC_CHARSET(0x000000BA), BALTIC_CHARSET(0x000000BA, "Cp1257"),
/** Specifies the Russian Cyrillic character set. */ /** Specifies the Russian Cyrillic character set. */
RUSSIAN_CHARSET(0x000000CC), RUSSIAN_CHARSET(0x000000CC, "Cp1251"),
/** Specifies the Thai character set. */ /** Specifies the Thai character set. */
THAI_CHARSET(0x000000DE), THAI_CHARSET(0x000000DE, "x-windows-874"),
/** Specifies a Eastern European character set. */ /** Specifies a Eastern European character set. */
EASTEUROPE_CHARSET(0x000000EE), EASTEUROPE_CHARSET(0x000000EE, "Cp1250"),
/** /**
* Specifies a mapping to one of the OEM code pages, * Specifies a mapping to one of the OEM code pages,
* according to the current system locale setting. * according to the current system locale setting.
*/ */
OEM_CHARSET(0x000000FF); OEM_CHARSET(0x000000FF, "Cp1252");
int flag; int flag;
WmfCharset(int flag) { Charset charset;
WmfCharset(int flag, String javaCharsetName) {
this.flag = flag; this.flag = flag;
if (javaCharsetName.length() > 0) {
try {
charset = Charset.forName(javaCharsetName);
return;
} catch (UnsupportedCharsetException e) {
logger.log(POILogger.WARN, "Unsupported charset: "+javaCharsetName);
}
}
charset = null;
}
/**
*
* @return charset for the font or <code>null</code> if there is no matching charset or
* if the charset is a &quot;default&quot;
*/
public Charset getCharset() {
return charset;
} }
static WmfCharset valueOf(int flag) { static WmfCharset valueOf(int flag) {

View File

@ -19,6 +19,7 @@ package org.apache.poi.hwmf.record;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.poi.hwmf.draw.HwmfDrawProperties; import org.apache.poi.hwmf.draw.HwmfDrawProperties;
import org.apache.poi.hwmf.draw.HwmfGraphics; import org.apache.poi.hwmf.draw.HwmfGraphics;
@ -27,7 +28,6 @@ import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory; import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.LittleEndianConsts; import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream; import org.apache.poi.util.LittleEndianInputStream;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger; import org.apache.poi.util.POILogger;
@ -144,7 +144,7 @@ public class HwmfText {
* length of the string. * length of the string.
* The string is written at the location specified by the XStart and YStart fields. * The string is written at the location specified by the XStart and YStart fields.
*/ */
private String text; private byte[] rawTextBytes;
/** /**
* A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical * A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical
* units, of the point where drawing is to start. * units, of the point where drawing is to start.
@ -164,18 +164,33 @@ public class HwmfText {
@Override @Override
public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException { public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException {
stringLength = leis.readShort(); stringLength = leis.readShort();
byte buf[] = new byte[stringLength+(stringLength&1)]; rawTextBytes = new byte[stringLength+(stringLength&1)];
leis.readFully(buf); leis.readFully(rawTextBytes);
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252).trim();
yStart = leis.readShort(); yStart = leis.readShort();
xStart = leis.readShort(); xStart = leis.readShort();
return 3*LittleEndianConsts.SHORT_SIZE+buf.length; return 3*LittleEndianConsts.SHORT_SIZE+rawTextBytes.length;
} }
@Override @Override
public void draw(HwmfGraphics ctx) { public void draw(HwmfGraphics ctx) {
Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0); Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0);
ctx.drawString(text, bounds); ctx.drawString(getTextBytes(), bounds);
}
/**
 * Decodes the text bytes (without padding) with the given charset.
 *
 * @param charset the charset used to decode the bytes
 * @return the decoded text
 */
public String getText(Charset charset) {
    final byte[] textBytes = getTextBytes();
    return new String(textBytes, charset);
}
/**
*
* @return a copy of a trimmed byte array of rawTextBytes bytes.
* This includes only the bytes from 0..stringLength.
* This does not include the extra optional padding on the byte array.
*/
private byte[] getTextBytes() {
byte[] ret = new byte[stringLength];
System.arraycopy(rawTextBytes, 0, ret, 0, stringLength);
return ret;
} }
} }
@ -264,7 +279,7 @@ public class HwmfText {
* the length is odd, an extra byte is placed after it so that the following member (optional Dx) is * the length is odd, an extra byte is placed after it so that the following member (optional Dx) is
* aligned on a 16-bit boundary. * aligned on a 16-bit boundary.
*/ */
private String text; private byte[] rawTextBytes;
/** /**
* An optional array of 16-bit signed integers that indicate the distance between * An optional array of 16-bit signed integers that indicate the distance between
* origins of adjacent character cells. For example, Dx[i] logical units separate the origins of * origins of adjacent character cells. For example, Dx[i] logical units separate the origins of
@ -300,10 +315,9 @@ public class HwmfText {
size += 4*LittleEndianConsts.SHORT_SIZE; size += 4*LittleEndianConsts.SHORT_SIZE;
} }
byte buf[] = new byte[stringLength+(stringLength&1)]; rawTextBytes = new byte[stringLength+(stringLength&1)];
leis.readFully(buf); leis.readFully(rawTextBytes);
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252); size += rawTextBytes.length;
size += buf.length;
if (size >= remainingRecordSize) { if (size >= remainingRecordSize) {
logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info"); logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info");
@ -327,7 +341,23 @@ public class HwmfText {
@Override @Override
public void draw(HwmfGraphics ctx) { public void draw(HwmfGraphics ctx) {
Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0); Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0);
ctx.drawString(text, bounds, dx); ctx.drawString(getTextBytes(), bounds, dx);
}
/**
 * Decodes the text bytes (without padding) with the given charset.
 *
 * @param charset the charset used to decode the bytes
 * @return the decoded text
 */
public String getText(Charset charset) {
    final byte[] textBytes = getTextBytes();
    return new String(textBytes, charset);
}
/**
*
* @return a copy of a trimmed byte array of rawTextBytes bytes.
* This includes only the bytes from 0..stringLength.
* This does not include the extra optional padding on the byte array.
*/
private byte[] getTextBytes() {
byte[] ret = new byte[stringLength];
System.arraycopy(rawTextBytes, 0, ret, 0, stringLength);
return ret;
} }
} }
@ -523,5 +553,9 @@ public class HwmfText {
public void applyObject(HwmfGraphics ctx) { public void applyObject(HwmfGraphics ctx) {
ctx.getProperties().setFont(font); ctx.getProperties().setFont(font);
} }
/**
 * @return the font held by this record
 */
public HwmfFont getFont() {
    return this.font;
}
} }
} }

View File

@ -18,7 +18,9 @@
package org.apache.poi.hwmf; package org.apache.poi.hwmf;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import javax.imageio.ImageIO;
import java.awt.Dimension; import java.awt.Dimension;
import java.awt.Graphics2D; import java.awt.Graphics2D;
import java.awt.RenderingHints; import java.awt.RenderingHints;
@ -31,21 +33,24 @@ import java.io.FileOutputStream;
import java.io.FilterInputStream; import java.io.FilterInputStream;
import java.io.IOException; import java.io.IOException;
import java.net.URL; import java.net.URL;
import java.nio.charset.Charset;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream; import java.util.zip.ZipInputStream;
import javax.imageio.ImageIO;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord; import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwmf.record.HwmfRecord; import org.apache.poi.hwmf.record.HwmfRecord;
import org.apache.poi.hwmf.record.HwmfRecordType;
import org.apache.poi.hwmf.record.HwmfText;
import org.apache.poi.hwmf.usermodel.HwmfPicture; import org.apache.poi.hwmf.usermodel.HwmfPicture;
import org.apache.poi.sl.usermodel.PictureData; import org.apache.poi.sl.usermodel.PictureData;
import org.apache.poi.sl.usermodel.PictureData.PictureType; import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.sl.usermodel.SlideShow; import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.Units; import org.apache.poi.util.Units;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
@ -188,4 +193,33 @@ public class TestHwmfParsing {
} }
} }
} }
/**
 * Collects the text of all extTextOut records, decoded with the charset
 * of the most recently seen createFontIndirect record, and checks that
 * the expected Cyrillic words are present.
 */
@Test
@Ignore("If we decide we can use common crawl file specified, we can turn this back on")
public void testCyrillic() throws Exception {
    //TODO: move test file to framework and fix this
    File dir = new File("C:/somethingOrOther");
    File f = new File(dir, "ZMLH54SPLI76NQ7XMKVB7SMUJA2HTXTS-2.wmf");
    StringBuilder sb = new StringBuilder();
    FileInputStream fis = new FileInputStream(f);
    try {
        HwmfPicture wmf = new HwmfPicture(fis);
        Charset charset = LocaleUtil.CHARSET_1252;
        //this is pure hackery for specifying the font
        //this happens to work on this test file, but you need to
        //do what Graphics does by maintaining the stack, etc.!
        for (HwmfRecord r : wmf.getRecords()) {
            if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
                HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont();
                // fall back to cp1252 when the font has no specific charset
                Charset fontCharset = font.getCharSet().getCharset();
                charset = (fontCharset == null) ? LocaleUtil.CHARSET_1252 : fontCharset;
            }
            if (r.getRecordType().equals(HwmfRecordType.extTextOut)) {
                HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r;
                sb.append(textOut.getText(charset)).append("\n");
            }
        }
    } finally {
        // always release the file handle, even if parsing or decoding fails
        fis.close();
    }
    String txt = sb.toString();
    assertTrue(txt.contains("\u041E\u0431\u0449\u043E"));
    assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441"));
}
} }