diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index 77cb10f35..25dc4b6df 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -12,7 +12,11 @@
The property with ID 1 holds the number of the codepage which was used
- to encode the strings in this section. The present HPSF codepage support
- is still very limited: When reading property value strings, HPSF
- distinguishes between 16-bit characters and 8-bit characters. 16-bit
- characters should be Unicode characters and thus be okay. 8-bit
- characters are interpreted according to the platform's default character
- set. This is fine as long as the document being read has been written on
- a platform with the same default character set. However, if you receive a
- document from another region of the world and want to process it with
- HPSF you are in trouble - unless the creator used Unicode, of course. The property's value is the number of a codepage,
i.e. a mapping from character codes to characters. All strings in the
section containing this property must be interpreted using this
- codepage. Typical property values are 1252 (8-bit "western" characters)
- or 1200 (16-bit Unicode characters).
+ codepage. Typical property values are 1252 (8-bit "western" characters,
+ ISO-8859-1), 1200 (16-bit Unicode characters, UTF-16), or 65001 (8-bit
+ Unicode characters, UTF-8).
@@ -833,18 +834,34 @@ No property set stream: "/1Table"
HPSF's codepage support is as good as the character encoding support of + the Java Virtual Machine (JVM) the application runs on. If HPSF + encounters a codepage number it assumes that the JVM has a character + encoding with a corresponding name. For example, if the codepage is 1252, + HPSF uses the character encoding "cp1252" to read or write strings. If + the JVM does not have that character encoding installed or if the + codepage number is illegal, an UnsupportedEncodingException will be + thrown.
+ +There are two exceptions to the rule that a character encoding's name + is derived from the codepage number by prepending the string "cp" to + it:
+ +What a dictionary is good for is explained in the HPSF HOW-TO. This chapter explains how it is + organized internally.
+ +The dictionary has a simple header consisting of a single UInt value. It + tells how many entries the dictionary comprises:
+ +Name | +Data type | +Description | +
---|---|---|
nrEntries | +UInt | +Number of dictionary entries | +
The dictionary entries follow the header. Each one looks like this:
+ +Name | +Data type | +Description | +
---|---|---|
key | +UInt | +The unique number of this property, i.e. the PID | +
length | +UInt | +The length of the property name associated with the key | +
value | +String | +The property's name, terminated with a 0x00 character | +
The entries are not aligned, i.e. each one follows its predecessor + without any gap or fill characters.
+In order to assemble the HPSF description I used information publicly diff --git a/src/documentation/content/xdocs/hpsf/todo.xml b/src/documentation/content/xdocs/hpsf/todo.xml index d99e7ff5d..3f50f8f4d 100644 --- a/src/documentation/content/xdocs/hpsf/todo.xml +++ b/src/documentation/content/xdocs/hpsf/todo.xml @@ -21,25 +21,20 @@ information streams.
org.apache.poi.hpsf.wellknown
to ease
- localizations. This would be useful for mapping standard property IDs to
- localized strings. Example: The property ID 4 could be mapped to "Author"
- in English or "Verfasser" in German.
+ Add resource bundles to
+ org.apache.poi.hpsf.wellknown
to ease
+ localizations. This would be useful for mapping standard property IDs to
+ localized strings. Example: The property ID 4 could be mapped to "Author"
+ in English or "Verfasser" in German.
java.awt.Image
example code in Thumbnail HOW TO.
+ Add WMF to java.awt.Image
example code in the Thumbnail HOW-TO.
Writes the property to an output stream.
* * @param out The output stream to write to. + * @param codepage The codepage to use for writing non-wide strings * @return the number of bytes written to the stream * * @exception IOException if an I/O error occurs * @exception WritingNotSupportedException if a variant type is to be * written that is not yet supported */ - public int write(final OutputStream out) + public int write(final OutputStream out, final int codepage) throws IOException, WritingNotSupportedException { int length = 0; long variantType = getType(); length += TypeWriter.writeUIntToStream(out, variantType); - length += VariantSupport.write(out, variantType, getValue()); + length += VariantSupport.write(out, variantType, getValue(), codepage); return length; } diff --git a/src/java/org/apache/poi/hpsf/MutableSection.java b/src/java/org/apache/poi/hpsf/MutableSection.java index 871c13360..c2fb33d6e 100644 --- a/src/java/org/apache/poi/hpsf/MutableSection.java +++ b/src/java/org/apache/poi/hpsf/MutableSection.java @@ -420,16 +420,16 @@ public class MutableSection extends Section /* If the property ID is not equal 0 we write the property and all * is fine. However, if it equals 0 we have to write the section's - * dictionary which does not have a type but just a value. */ + * dictionary which has an implicit type only and an explicit + * value. */ if (id != 0) /* Write the property and update the position to the next * property. 
*/ - position += p.write(propertyStream); + position += p.write(propertyStream, getCodepage()); else { - final Integer codepage = - (Integer) getProperty(PropertyIDMap.PID_CODEPAGE); - if (codepage == null) + final int codepage = getCodepage(); + if (codepage == -1) throw new IllegalPropertySetDataException ("Codepage (property 1) is undefined."); position += writeDictionary(propertyStream, dictionary); diff --git a/src/java/org/apache/poi/hpsf/Property.java b/src/java/org/apache/poi/hpsf/Property.java index e4d70bd1c..2b04da852 100644 --- a/src/java/org/apache/poi/hpsf/Property.java +++ b/src/java/org/apache/poi/hpsf/Property.java @@ -62,9 +62,11 @@ */ package org.apache.poi.hpsf; +import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; +import org.apache.poi.util.HexDump; import org.apache.poi.util.LittleEndian; /** @@ -161,9 +163,13 @@ public class Property * @param length The property's type/value pair's length in bytes. * @param codepage The section's and thus the property's * codepage. It is needed only when reading string values. 
+ * + * @exception UnsupportedEncodingException if the specified codepage is not + * supported */ public Property(final long id, final byte[] src, final long offset, final int length, final int codepage) + throws UnsupportedEncodingException { this.id = id; @@ -183,7 +189,7 @@ public class Property try { - value = VariantSupport.read(src, o, length, (int) type); + value = VariantSupport.read(src, o, length, (int) type, codepage); } catch (UnsupportedVariantTypeException ex) { @@ -382,8 +388,27 @@ public class Property b.append(getID()); b.append(", type: "); b.append(getType()); + final Object value = getValue(); b.append(", value: "); - b.append(getValue()); + b.append(value.toString()); + if (value instanceof String) + { + final String s = (String) value; + final int l = s.length(); + final byte[] bytes = new byte[l * 2]; + for (int i = 0; i < l; i++) + { + final char c = s.charAt(i); + final byte high = (byte) ((c & 0x00ff00) >> 8); + final byte low = (byte) ((c & 0x0000ff) >> 0); + bytes[i * 2] = high; + bytes[i * 2 + 1] = low; + } + final String hex = HexDump.dump(bytes, 0L, 0); + b.append(" ["); + b.append(hex); + b.append("]"); + } b.append(']'); return b.toString(); } diff --git a/src/java/org/apache/poi/hpsf/PropertySet.java b/src/java/org/apache/poi/hpsf/PropertySet.java index dae88e73c..a92c3b9fd 100644 --- a/src/java/org/apache/poi/hpsf/PropertySet.java +++ b/src/java/org/apache/poi/hpsf/PropertySet.java @@ -56,6 +56,7 @@ package org.apache.poi.hpsf; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; @@ -300,9 +301,11 @@ public class PropertySet * @param length The length of the stream data. * @throws NoPropertySetStreamException if the byte array is not a * property set stream. 
+ * + * @exception UnsupportedEncodingException if the codepage is not supported */ public PropertySet(final byte[] stream, final int offset, final int length) - throws NoPropertySetStreamException + throws NoPropertySetStreamException, UnsupportedEncodingException { if (isPropertySetStream(stream, offset, length)) init(stream, offset, length); @@ -321,8 +324,11 @@ public class PropertySet * complete byte array contents is the stream data. * @throws NoPropertySetStreamException if the byte array is not a * property set stream. + * + * @exception UnsupportedEncodingException if the codepage is not supported */ - public PropertySet(final byte[] stream) throws NoPropertySetStreamException + public PropertySet(final byte[] stream) + throws NoPropertySetStreamException, UnsupportedEncodingException { this(stream, 0, stream.length); } @@ -435,6 +441,7 @@ public class PropertySet * @param length Length of the property set stream. */ private void init(final byte[] src, final int offset, final int length) + throws UnsupportedEncodingException { /* FIXME (3): Ensure that at most "length" bytes are read. 
*/ @@ -651,7 +658,7 @@ public class PropertySet final PropertySet ps = (PropertySet) o; int byteOrder1 = ps.getByteOrder(); int byteOrder2 = getByteOrder(); - ClassID classId1 = ps.getClassID(); + ClassID classID1 = ps.getClassID(); ClassID classID2 = getClassID(); int format1 = ps.getFormat(); int format2 = getFormat(); @@ -660,7 +667,7 @@ public class PropertySet int sectionCount1 = ps.getSectionCount(); int sectionCount2 = getSectionCount(); if (byteOrder1 != byteOrder2 || - !classId1.equals(classID2) || + !classID1.equals(classID2) || format1 != format2 || osVersion1 != osVersion2 || sectionCount1 != sectionCount2) diff --git a/src/java/org/apache/poi/hpsf/Section.java b/src/java/org/apache/poi/hpsf/Section.java index 2cc4cc148..97b30ea26 100644 --- a/src/java/org/apache/poi/hpsf/Section.java +++ b/src/java/org/apache/poi/hpsf/Section.java @@ -54,6 +54,7 @@ */ package org.apache.poi.hpsf; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; @@ -193,8 +194,12 @@ public class Section * @param src Contains the complete property set stream. * @param offset The position in the stream that points to the * section's format ID. + * + * @exception UnsupportedEncodingException if the section's codepage is not + * supported. */ public Section(final byte[] src, final int offset) + throws UnsupportedEncodingException { int o1 = offset; @@ -638,4 +643,18 @@ public class Section return dictionary; } + + + /** + *Gets the section's codepage, if any.
+ * + * @return The section's codepage if one is defined, else -1. + */ + public int getCodepage() + { + final Integer codepage = + (Integer) getProperty(PropertyIDMap.PID_CODEPAGE); + return codepage != null ? codepage.intValue() : -1; + } + } diff --git a/src/java/org/apache/poi/hpsf/TypeWriter.java b/src/java/org/apache/poi/hpsf/TypeWriter.java index 2433353f4..aed32cefa 100644 --- a/src/java/org/apache/poi/hpsf/TypeWriter.java +++ b/src/java/org/apache/poi/hpsf/TypeWriter.java @@ -185,7 +185,8 @@ public class TypeWriter * @exception IOException if an I/O error occurs */ public static void writeToStream(final OutputStream out, - final Property[] properties) + final Property[] properties, + final int codepage) throws IOException, UnsupportedVariantTypeException { /* If there are no properties don't write anything. */ @@ -207,7 +208,7 @@ public class TypeWriter final Property p = (Property) properties[i]; long type = p.getType(); writeUIntToStream(out, type); - VariantSupport.write(out, (int) type, p.getValue()); + VariantSupport.write(out, (int) type, p.getValue(), codepage); } } diff --git a/src/java/org/apache/poi/hpsf/VariantSupport.java b/src/java/org/apache/poi/hpsf/VariantSupport.java index 17892abd2..29360420d 100644 --- a/src/java/org/apache/poi/hpsf/VariantSupport.java +++ b/src/java/org/apache/poi/hpsf/VariantSupport.java @@ -64,6 +64,7 @@ package org.apache.poi.hpsf; import java.io.IOException; import java.io.OutputStream; +import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.LinkedList; import java.util.List; @@ -163,17 +164,21 @@ public class VariantSupport extends Variant * @param length The length of the variant including the variant * type field * @param type The variant type to read + * @param codepage The codepage to use to write non-wide strings * @return A Java object that corresponds best to the variant * field. For example, a VT_I4 is returned as a {@link Long}, a * VT_LPSTR as a {@link String}. 
* @exception ReadingNotSupportedException if a property is to be written * who's variant type HPSF does not yet support + * @exception UnsupportedEncodingException if the specified codepage is not + * supported * * @see Variant */ public static Object read(final byte[] src, final int offset, - final int length, final long type) - throws ReadingNotSupportedException + final int length, final long type, + final int codepage) + throws ReadingNotSupportedException, UnsupportedEncodingException { Object value; int o1 = offset; @@ -221,18 +226,18 @@ public class VariantSupport extends Variant * Read a byte string. In Java it is represented as a * String object. The 0x00 bytes at the end must be * stripped. - * - * FIXME (2): Reading an 8-bit string should pay attention - * to the codepage. Currently the byte making out the - * property's value are interpreted according to the - * platform's default character set. */ final int first = o1 + LittleEndian.INT_SIZE; long last = first + LittleEndian.getUInt(src, o1) - 1; o1 += LittleEndian.INT_SIZE; + final int rawLength = (int) (last - first + 1); while (src[(int) last] == 0 && first <= last) last--; - value = new String(src, (int) first, (int) (last - first + 1)); + final int l = (int) (last - first + 1); + value = codepage != -1 ? + new String(src, (int) first, l, + codepageToEncoding(codepage)) : + new String(src, (int) first, l); break; } case Variant.VT_LPWSTR: @@ -298,6 +303,38 @@ public class VariantSupport extends Variant + /** + *Turns a codepage number into the equivalent character encoding's + * name.
+ * + * @param codepage The codepage number + * + * @return The character encoding's name. If the codepage number is 65001, + * the encoding name is "UTF-8". All other positive numbers are mapped to + * "cp" followed by the number, e.g. if the codepage number is 1252 the + * returned character encoding name will be "cp1252". + * + * @exception UnsupportedEncodingException if the specified codepage is + * less than zero. + */ + public static String codepageToEncoding(final int codepage) + throws UnsupportedEncodingException + { + if (codepage <= 0) + throw new UnsupportedEncodingException + ("Codepage number may not be " + codepage); + switch (codepage) + { + case 1200: + return "UTF-16"; + case 65001: + return "UTF-8"; + default: + return "cp" + codepage; + } + } + + /** *Writes a variant value to an output stream. This method ensures that * always a multiple of 4 bytes is written.
@@ -305,6 +342,7 @@ public class VariantSupport extends Variant * @param out The stream to write the value to. * @param type The variant's type. * @param value The variant's value. + * @param codepage The codepage to use to write non-wide strings * @return The number of entities that have been written. In many cases an * "entity" is a byte but this is not always the case. * @exception IOException if an I/O exceptions occurs @@ -312,7 +350,7 @@ public class VariantSupport extends Variant * who's variant type HPSF does not yet support */ public static int write(final OutputStream out, final long type, - final Object value) + final Object value, final int codepage) throws IOException, WritingNotSupportedException { int length = 0; @@ -330,16 +368,13 @@ public class VariantSupport extends Variant } case Variant.VT_LPSTR: { - length = TypeWriter.writeUIntToStream - (out, ((String) value).length() + 1); - char[] s = Util.pad4((String) value); - /* FIXME (2): The following line forces characters to bytes. - * This is generally wrong and should only be done according to - * a codepage. Alternatively Unicode could be written (see - * Variant.VT_LPWSTR). */ - byte[] b = new byte[s.length + 1]; - for (int i = 0; i < s.length; i++) - b[i] = (byte) s[i]; + final byte[] bytes = + (codepage == -1 ? + ((String) value).getBytes() : + ((String) value).getBytes(codepageToEncoding(codepage))); + length = TypeWriter.writeUIntToStream(out, bytes.length + 1); + final byte[] b = new byte[bytes.length + 1]; + System.arraycopy(bytes, 0, b, 0, bytes.length); b[b.length - 1] = 0x00; out.write(b); length += b.length; @@ -419,12 +454,13 @@ public class VariantSupport extends Variant } } - /* Add 0x00 character to write a multiple of four bytes: */ - while (length % 4 != 0) - { - out.write(0); - length++; - } + /* Add 0x00 characters to write a multiple of four bytes: */ + // FIXME (1) Try this! 
+// while (length % 4 != 0) +// { +// out.write(0); +// length++; +// } return length; } diff --git a/src/testcases/org/apache/poi/hpsf/basic/TestWrite.java b/src/testcases/org/apache/poi/hpsf/basic/TestWrite.java index d62378dff..95b78ea66 100644 --- a/src/testcases/org/apache/poi/hpsf/basic/TestWrite.java +++ b/src/testcases/org/apache/poi/hpsf/basic/TestWrite.java @@ -357,7 +357,10 @@ public class TestWrite extends TestCase catch (Exception ex) { ex.printStackTrace(); - throw new RuntimeException(ex); + throw new RuntimeException(ex.toString()); + /* FIXME (2): Replace the previous line by the following + * one once we no longer need JDK 1.3 compatibility. */ + // throw new RuntimeException(ex); } } }, @@ -398,37 +401,40 @@ public class TestWrite extends TestCase public void testVariantTypes() { Throwable t = null; + final int codepage = -1; + /* FIXME (2): Add tests for various codepages! */ try { - check(Variant.VT_EMPTY, null); - check(Variant.VT_BOOL, new Boolean(true)); - check(Variant.VT_BOOL, new Boolean(false)); - check(Variant.VT_CF, new byte[]{0}); - check(Variant.VT_CF, new byte[]{0, 1}); - check(Variant.VT_CF, new byte[]{0, 1, 2}); - check(Variant.VT_CF, new byte[]{0, 1, 2, 3}); - check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4}); - check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4, 5}); - check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); - check(Variant.VT_I2, new Integer(27)); - check(Variant.VT_I4, new Long(28)); - check(Variant.VT_FILETIME, new Date()); - check(Variant.VT_LPSTR, ""); - check(Variant.VT_LPSTR, "ä"); - check(Variant.VT_LPSTR, "äö"); - check(Variant.VT_LPSTR, "äöü"); - check(Variant.VT_LPSTR, "äöüÄ"); - check(Variant.VT_LPSTR, "äöüÄÖ"); - check(Variant.VT_LPSTR, "äöüÄÖÜ"); - check(Variant.VT_LPSTR, "äöüÄÖÜß"); - check(Variant.VT_LPWSTR, ""); - check(Variant.VT_LPWSTR, "ä"); - check(Variant.VT_LPWSTR, "äö"); - check(Variant.VT_LPWSTR, "äöü"); - check(Variant.VT_LPWSTR, "äöüÄ"); - check(Variant.VT_LPWSTR, "äöüÄÖ"); - 
check(Variant.VT_LPWSTR, "äöüÄÖÜ"); - check(Variant.VT_LPWSTR, "äöüÄÖÜß"); + check(Variant.VT_EMPTY, null, codepage); + check(Variant.VT_BOOL, new Boolean(true), codepage); + check(Variant.VT_BOOL, new Boolean(false), codepage); + check(Variant.VT_CF, new byte[]{0}, codepage); + check(Variant.VT_CF, new byte[]{0, 1}, codepage); + check(Variant.VT_CF, new byte[]{0, 1, 2}, codepage); + check(Variant.VT_CF, new byte[]{0, 1, 2, 3}, codepage); + check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4}, codepage); + check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4, 5}, codepage); + check(Variant.VT_CF, new byte[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + codepage); + check(Variant.VT_I2, new Integer(27), codepage); + check(Variant.VT_I4, new Long(28), codepage); + check(Variant.VT_FILETIME, new Date(), codepage); + check(Variant.VT_LPSTR, "", codepage); + check(Variant.VT_LPSTR, "ä", codepage); + check(Variant.VT_LPSTR, "äö", codepage); + check(Variant.VT_LPSTR, "äöü", codepage); + check(Variant.VT_LPSTR, "äöüÄ", codepage); + check(Variant.VT_LPSTR, "äöüÄÖ", codepage); + check(Variant.VT_LPSTR, "äöüÄÖÜ", codepage); + check(Variant.VT_LPSTR, "äöüÄÖÜß", codepage); + check(Variant.VT_LPWSTR, "", codepage); + check(Variant.VT_LPWSTR, "ä", codepage); + check(Variant.VT_LPWSTR, "äö", codepage); + check(Variant.VT_LPWSTR, "äöü", codepage); + check(Variant.VT_LPWSTR, "äöüÄ", codepage); + check(Variant.VT_LPWSTR, "äöüÄÖ", codepage); + check(Variant.VT_LPWSTR, "äöüÄÖÜ", codepage); + check(Variant.VT_LPWSTR, "äöüÄÖÜß", codepage); } catch (Exception ex) { @@ -466,20 +472,22 @@ public class TestWrite extends TestCase * @throws UnsupportedVariantTypeException if the variant is not supported. * @throws IOException if an I/O exception occurs. 
*/ - private void check(final long variantType, final Object value) + private void check(final long variantType, final Object value, + final int codepage) throws UnsupportedVariantTypeException, IOException { final ByteArrayOutputStream out = new ByteArrayOutputStream(); - VariantSupport.write(out, variantType, value); + VariantSupport.write(out, variantType, value, codepage); out.close(); final byte[] b = out.toByteArray(); final Object objRead = VariantSupport.read(b, 0, b.length + LittleEndian.INT_SIZE, - variantType); + variantType, -1); if (objRead instanceof byte[]) { - final int diff = diff(org.apache.poi.hpsf.Util.pad4 - ((byte[]) value), (byte[]) objRead); +// final int diff = diff(org.apache.poi.hpsf.Util.pad4 +// ((byte[]) value), (byte[]) objRead); + final int diff = diff((byte[]) value, (byte[]) objRead); if (diff >= 0) fail("Byte arrays are different. First different byte is at " + "index " + diff + "."); diff --git a/src/testcases/org/apache/poi/hpsf/data/TestChineseProperties.doc b/src/testcases/org/apache/poi/hpsf/data/TestChineseProperties.doc new file mode 100644 index 000000000..fd2b478e4 Binary files /dev/null and b/src/testcases/org/apache/poi/hpsf/data/TestChineseProperties.doc differ