Bug 49541 - Mapping of symbol characters to unicode equivalent

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1648415 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2014-12-29 19:43:35 +00:00
parent 8e3eee35cd
commit 26ae1edcd8
3 changed files with 263 additions and 0 deletions

View File

@ -20,7 +20,9 @@ package org.apache.poi.util;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.text.FieldPosition; import java.text.FieldPosition;
import java.text.NumberFormat; import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import org.apache.poi.hssf.record.RecordInputStream; import org.apache.poi.hssf.record.RecordInputStream;
/** /**
@ -37,6 +39,7 @@ import org.apache.poi.hssf.record.RecordInputStream;
public class StringUtil { public class StringUtil {
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
private static final Charset UTF16LE = Charset.forName("UTF-16LE"); private static final Charset UTF16LE = Charset.forName("UTF-16LE");
private static Map<Integer,Integer> msCodepointToUnicode;
private StringUtil() { private StringUtil() {
// no instances of this class // no instances of this class
@ -396,4 +399,248 @@ public class StringUtil {
} }
public void remove() {} public void remove() {}
} }
/**
 * Replaces characters that were encoded in the unicode private use area
 * (as done for the symbol fonts) by their counterparts in the regular
 * unicode range.
 * <p>
 * The input is walked code point by code point. A code point that has an
 * entry in the mapping table is substituted by the mapped value, every other
 * code point is copied to the result unchanged.
 *
 * @param string the original string; {@code null} and the empty string are
 *               returned as they are
 * @return the string with mapped characters
 *
 * @see <a href="http://www.alanwood.net/unicode/private_use_area.html#symbol">Private Use Area (symbol)</a>
 * @see <a href="http://www.alanwood.net/demos/symbol.html">Symbol font - Unicode alternatives for Greek and special characters in HTML</a>
 */
public static String mapMsCodepointString(String string) {
    if (string == null || string.length() == 0) {
        return string;
    }

    // make sure the lookup table exists before the first lookup
    initMsCodepointMap();

    final StringBuilder mapped = new StringBuilder();
    final int end = string.length();
    int pos = 0;
    while (pos < end) {
        final int source = string.codePointAt(pos);
        final Integer target = msCodepointToUnicode.get(source);
        if (target == null) {
            // no mapping registered - keep the code point as it is
            mapped.appendCodePoint(source);
        } else {
            mapped.appendCodePoint(target);
        }
        // advance by one or two chars, depending on whether this was a surrogate pair
        pos += Character.charCount(source);
    }
    return mapped.toString();
}
/**
 * Registers a single mapping from a code point as found in the document
 * to the unicode code point that should replace it. An existing entry for
 * the same source code point is overwritten.
 * <p>
 * The default table is initialized first, so the entry is added on top of
 * the built-in symbol mappings.
 *
 * @param msCodepoint the code point to be replaced
 * @param unicodeCodepoint the code point to use instead
 */
public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
    initMsCodepointMap();
    final Integer source = Integer.valueOf(msCodepoint);
    final Integer target = Integer.valueOf(unicodeCodepoint);
    msCodepointToUnicode.put(source, target);
}
/**
 * Creates and fills the code point lookup table on the first call;
 * every later call returns immediately.
 * <p>
 * The two symbol tables are laid out consecutively: entry {@code n} of
 * {@code symbolMap_f020} belongs to code point {@code 0xF020 + n}, entry
 * {@code n} of {@code symbolMap_f0a0} to code point {@code 0xF0A0 + n}.
 */
private static synchronized void initMsCodepointMap() {
    if (msCodepointToUnicode != null) {
        return;
    }

    msCodepointToUnicode = new HashMap<Integer,Integer>();

    for (int idx = 0; idx < symbolMap_f020.length; idx++) {
        msCodepointToUnicode.put(0xF020 + idx, symbolMap_f020[idx]);
    }

    for (int idx = 0; idx < symbolMap_f0a0.length; idx++) {
        msCodepointToUnicode.put(0xF0A0 + idx, symbolMap_f0a0[idx]);
    }
}
private static final int symbolMap_f020[] = {
' ', // 0xf020 space
'!', // 0xf021 exclam
8704, // 0xf022 universal
'#', // 0xf023 numbersign
8707, // 0xf024 existential
'%', // 0xf025 percent
'&', // 0xf026 ampersand
8717, // 0xf027 suchthat
'(', // 0xf028 parenleft
')', // 0xf029 parentright
8727, // 0xf02a asteriskmath
'+', // 0xf02b plus
',', // 0xf02c comma
8722, // 0xf02d minus sign (long -)
'.', // 0xf02e period
'/', // 0xf02f slash
'0', // 0xf030 0
'1', // 0xf031 1
'2', // 0xf032 2
'3', // 0xf033 3
'4', // 0xf034 4
'5', // 0xf035 5
'6', // 0xf036 6
'7', // 0xf037 7
'8', // 0xf038 8
'9', // 0xf039 9
':', // 0xf03a colon
';', // 0xf03b semicolon
'<', // 0xf03c less
'=', // 0xf03d equal
'>', // 0xf03e greater
'?', // 0xf03f question
8773, // 0xf040 congruent
913, // 0xf041 alpha (upper)
914, // 0xf042 beta (upper)
935, // 0xf043 chi (upper)
916, // 0xf044 delta (upper)
917, // 0xf045 epsilon (upper)
934, // 0xf046 phi (upper)
915, // 0xf047 gamma (upper)
919, // 0xf048 eta (upper)
921, // 0xf049 iota (upper)
977, // 0xf04a theta1 (lower)
922, // 0xf04b kappa (upper)
923, // 0xf04c lambda (upper)
924, // 0xf04d mu (upper)
925, // 0xf04e nu (upper)
927, // 0xf04f omicron (upper)
928, // 0xf050 pi (upper)
920, // 0xf051 theta (upper)
929, // 0xf052 rho (upper)
931, // 0xf053 sigma (upper)
932, // 0xf054 tau (upper)
933, // 0xf055 upsilon (upper)
962, // 0xf056 simga1 (lower)
937, // 0xf057 omega (upper)
926, // 0xf058 xi (upper)
936, // 0xf059 psi (upper)
918, // 0xf05a zeta (upper)
'[', // 0xf05b bracketleft
8765, // 0xf05c therefore
']', // 0xf05d bracketright
8869, // 0xf05e perpendicular
'_', // 0xf05f underscore
' ', // 0xf060 radicalex (doesn't exist in unicode)
945, // 0xf061 alpha (lower)
946, // 0xf062 beta (lower)
967, // 0xf063 chi (lower)
948, // 0xf064 delta (lower)
949, // 0xf065 epsilon (lower)
966, // 0xf066 phi (lower)
947, // 0xf067 gamma (lower)
951, // 0xf068 eta (lower)
953, // 0xf069 iota (lower)
981, // 0xf06a phi1 (lower)
954, // 0xf06b kappa (lower)
955, // 0xf06c lambda (lower)
956, // 0xf06d mu (lower)
957, // 0xf06e nu (lower)
959, // 0xf06f omnicron (lower)
960, // 0xf070 pi (lower)
952, // 0xf071 theta (lower)
961, // 0xf072 rho (lower)
963, // 0xf073 sigma (lower)
964, // 0xf074 tau (lower)
965, // 0xf075 upsilon (lower)
982, // 0xf076 piv (lower)
969, // 0xf077 omega (lower)
958, // 0xf078 xi (lower)
968, // 0xf079 psi (lower)
950, // 0xf07a zeta (lower)
'{', // 0xf07b braceleft
'|', // 0xf07c bar
'}', // 0xf07d braceright
8764, // 0xf07e similar '~'
' ', // 0xf07f not defined
};
private static final int symbolMap_f0a0[] = {
8364, // 0xf0a0 not defined / euro symbol
978, // 0xf0a1 upsilon1 (upper)
8242, // 0xf0a2 minute
8804, // 0xf0a3 lessequal
8260, // 0xf0a4 fraction
8734, // 0xf0a5 infinity
402, // 0xf0a6 florin
9827, // 0xf0a7 club
9830, // 0xf0a8 diamond
9829, // 0xf0a9 heart
9824, // 0xf0aa spade
8596, // 0xf0ab arrowboth
8591, // 0xf0ac arrowleft
8593, // 0xf0ad arrowup
8594, // 0xf0ae arrowright
8595, // 0xf0af arrowdown
176, // 0xf0b0 degree
177, // 0xf0b1 plusminus
8243, // 0xf0b2 second
8805, // 0xf0b3 greaterequal
215, // 0xf0b4 multiply
181, // 0xf0b5 proportional
8706, // 0xf0b6 partialdiff
8729, // 0xf0b7 bullet
247, // 0xf0b8 divide
8800, // 0xf0b9 notequal
8801, // 0xf0ba equivalence
8776, // 0xf0bb approxequal
8230, // 0xf0bc ellipsis
9168, // 0xf0bd arrowvertex
9135, // 0xf0be arrowhorizex
8629, // 0xf0bf carriagereturn
8501, // 0xf0c0 aleph
8475, // 0xf0c1 Ifraktur
8476, // 0xf0c2 Rfraktur
8472, // 0xf0c3 weierstrass
8855, // 0xf0c4 circlemultiply
8853, // 0xf0c5 circleplus
8709, // 0xf0c6 emptyset
8745, // 0xf0c7 intersection
8746, // 0xf0c8 union
8835, // 0xf0c9 propersuperset
8839, // 0xf0ca reflexsuperset
8836, // 0xf0cb notsubset
8834, // 0xf0cc propersubset
8838, // 0xf0cd reflexsubset
8712, // 0xf0ce element
8713, // 0xf0cf notelement
8736, // 0xf0d0 angle
8711, // 0xf0d1 gradient
174, // 0xf0d2 registerserif
169, // 0xf0d3 copyrightserif
8482, // 0xf0d4 trademarkserif
8719, // 0xf0d5 product
8730, // 0xf0d6 radical
8901, // 0xf0d7 dotmath
172, // 0xf0d8 logicalnot
8743, // 0xf0d9 logicaland
8744, // 0xf0da logicalor
8660, // 0xf0db arrowdblboth
8656, // 0xf0dc arrowdblleft
8657, // 0xf0dd arrowdblup
8658, // 0xf0de arrowdblright
8659, // 0xf0df arrowdbldown
9674, // 0xf0e0 lozenge
9001, // 0xf0e1 angleleft
174, // 0xf0e2 registersans
169, // 0xf0e3 copyrightsans
8482, // 0xf0e4 trademarksans
8721, // 0xf0e5 summation
9115, // 0xf0e6 parenlefttp
9116, // 0xf0e7 parenleftex
9117, // 0xf0e8 parenleftbt
9121, // 0xf0e9 bracketlefttp
9122, // 0xf0ea bracketleftex
9123, // 0xf0eb bracketleftbt
9127, // 0xf0ec bracelefttp
9128, // 0xf0ed braceleftmid
9129, // 0xf0ee braceleftbt
9130, // 0xf0ef braceex
' ', // 0xf0f0 not defined
9002, // 0xf0f1 angleright
8747, // 0xf0f2 integral
8992, // 0xf0f3 integraltp
9134, // 0xf0f4 integralex
8993, // 0xf0f5 integralbt
9118, // 0xf0f6 parenrighttp
9119, // 0xf0f7 parenrightex
9120, // 0xf0f8 parenrightbt
9124, // 0xf0f9 bracketrighttp
9125, // 0xf0fa bracketrightex
9126, // 0xf0fb bracketrightbt
9131, // 0xf0fc bracerighttp
9132, // 0xf0fd bracerightmid
9133, // 0xf0fe bracerightbt
' ', // 0xf0ff not defined
};
} }

View File

@ -58,6 +58,7 @@ import org.apache.poi.hslf.record.Record;
import org.apache.poi.hslf.record.SlideListWithText; import org.apache.poi.hslf.record.SlideListWithText;
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet; import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
import org.apache.poi.hslf.record.TextHeaderAtom; import org.apache.poi.hslf.record.TextHeaderAtom;
import org.apache.poi.util.StringUtil;
import org.junit.Test; import org.junit.Test;
/** /**
@ -579,4 +580,19 @@ public final class TestBugs {
inputStream.close(); inputStream.close();
} }
} }
/**
 * Bug 49541: reads the text of the first text box inside the first shape
 * group of the first slide and checks that, after mapping the symbol
 * characters, it reads "&ge;75 years".
 */
@Test
public void bug49541() throws Exception {
    InputStream pptStream = new FileInputStream(_slTests.getFile("49541_symbol_map.ppt"));
    try {
        SlideShow show = new SlideShow(pptStream);
        Slide firstSlide = show.getSlides()[0];
        ShapeGroup group = (ShapeGroup) firstSlide.getShapes()[0];
        TextBox box = (TextBox) group.getShapes()[0];

        // the raw text still carries the private use area code point of the symbol font
        String rawText = box.getText();
        String mappedText = StringUtil.mapMsCodepointString(rawText);

        assertEquals("\u226575 years", mappedText);
    } finally {
        pptStream.close();
    }
}
} }

Binary file not shown.