poi/src/java/org/apache/poi/hpsf/VariantSupport.java

/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hpsf;

import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;

import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;

/**
 * <p>Supports reading and writing of variant data.</p>
 *
 * <p><strong>FIXME (3):</strong> Reading and writing should be made more
 * uniform than it is now. The following items should be resolved:
 *
 * <ul>
 *
 * <li><p>Reading requires a length parameter that is 4 byte greater than the
 * actual data, because the variant type field is included. </p></li>
 *
 * <li><p>Reading reads from a byte array while writing writes to an byte array
 * output stream.</p></li>
 *
 * </ul>
 *
 * @author Rainer Klute <a
 * href="mailto:klute@rainer-klute.de">&lt;klute@rainer-klute.de&gt;</a>
 */
public class VariantSupport extends Variant
{

    private static boolean logUnsupportedTypes = false;

    /**
     * <p>Specifies whether warnings about unsupported variant types are to be
     * written to <code>System.err</code> or not.</p>
     *
     * @param logUnsupportedTypes If <code>true</code> warnings will be written,
     * if <code>false</code> they won't.
     */
    public static void setLogUnsupportedTypes(final boolean logUnsupportedTypes)
    {
        VariantSupport.logUnsupportedTypes = logUnsupportedTypes;
    }

    /**
     * <p>Checks whether logging of unsupported variant types warning is turned
     * on or off.</p>
     *
     * @return <code>true</code> if logging is turned on, else
     * <code>false</code>.
     */
    public static boolean isLogUnsupportedTypes()
    {
        return logUnsupportedTypes;
    }


    /**
     * <p>Keeps a list of the variant types an "unsupported" message has already
     * been issued for.</p>
     */
    protected static List unsupportedMessage;

    /**
     * <p>Writes a warning to <code>System.err</code> that a variant type is
     * unsupported by HPSF. Such a warning is written only once for each variant
     * type. Log messages can be turned on or off by </p>
     *
     * @param ex The exception to log
     */
    protected static void writeUnsupportedTypeMessage
        (final UnsupportedVariantTypeException ex)
    {
        if (isLogUnsupportedTypes())
        {
            if (unsupportedMessage == null)
                unsupportedMessage = new LinkedList();
            Long vt = new Long(ex.getVariantType());
            if (!unsupportedMessage.contains(vt))
            {
                System.err.println(ex.getMessage());
                unsupportedMessage.add(vt);
            }
        }
    }


    /**
     * <p>HPSF is able to read these {@link Variant} types.</p>
     */
    final static public int[] SUPPORTED_TYPES = { Variant.VT_EMPTY,
            Variant.VT_I2, Variant.VT_I4, Variant.VT_I8, Variant.VT_R8,
            Variant.VT_FILETIME, Variant.VT_LPSTR, Variant.VT_LPWSTR,
            Variant.VT_CF, Variant.VT_BOOL };


    /**
     * <p>Checks whether HPSF supports the specified variant type. Unsupported
     * types should be implemented included in the {@link #SUPPORTED_TYPES}
     * array.</p>
     *
     * @see Variant
     * @param variantType the variant type to check
     * @return <code>true</code> if HPFS supports this type, else
     *         <code>false</code>
     */
    public boolean isSupportedType(final int variantType)
    {
        for (int i = 0; i < SUPPORTED_TYPES.length; i++)
            if (variantType == SUPPORTED_TYPES[i])
                return true;
        return false;
    }


    /**
     * <p>Reads a variant type from a byte array.</p>
     *
     * @param src The byte array
     * @param offset The offset in the byte array where the variant starts
     * @param length The length of the variant including the variant type field
     * @param type The variant type to read
     * @param codepage The codepage to use for non-wide strings
     * @return A Java object that corresponds best to the variant field. For
     *         example, a VT_I4 is returned as a {@link Long}, a VT_LPSTR as a
     *         {@link String}.
     * @exception ReadingNotSupportedException if a property is to be written
     *            who's variant type HPSF does not yet support
     * @exception UnsupportedEncodingException if the specified codepage is not
     *            supported.
     * @see Variant
     */
    public static Object read(final byte[] src, final int offset,
                              final int length, final long type,
                              final int codepage)
    throws ReadingNotSupportedException, UnsupportedEncodingException
    {
        Object value;
        int o1 = offset;
        int l1 = length - LittleEndian.INT_SIZE;
        long lType = type;

        /* Instead of trying to read 8-bit characters from a Unicode string,
         * read 16-bit characters. */
        if (codepage == Constants.CP_UNICODE && type == Variant.VT_LPSTR)
            lType = Variant.VT_LPWSTR;

        switch ((int) lType)
        {
            case Variant.VT_EMPTY:
            {
                value = null;
                break;
            }
            case Variant.VT_I2:
            {
                /*
                 * Read a short. In Java it is represented as an
                 * Integer object.
                 */
                value = new Integer(LittleEndian.getShort(src, o1));
                break;
            }
            case Variant.VT_I4:
            {
                /*
                 * Read a word. In Java it is represented as an
                 * Integer object.
                 */
                value = new Integer(LittleEndian.getInt(src, o1));
                break;
            }
            case Variant.VT_I8:
            {
                /*
                 * Read a double word. In Java it is represented as a
                 * Long object.
                 */
                value = new Long(LittleEndian.getLong(src, o1));
                break;
            }
            case Variant.VT_R8:
            {
                /*
                 * Read an eight-byte double value. In Java it is represented as
                 * a Double object.
                 */
                value = new Double(LittleEndian.getDouble(src, o1));
                break;
            }
            case Variant.VT_FILETIME:
            {
                /*
                 * Read a FILETIME object. In Java it is represented
                 * as a Date object.
                 */
                final long low = LittleEndian.getUInt(src, o1);
                o1 += LittleEndian.INT_SIZE;
                final long high = LittleEndian.getUInt(src, o1);
                value = Util.filetimeToDate((int) high, (int) low);
                break;
            }
            case Variant.VT_LPSTR:
            {
                /*
                 * Read a byte string. In Java it is represented as a
                 * String object. The 0x00 bytes at the end must be
                 * stripped.
                 */
                final int first = o1 + LittleEndian.INT_SIZE;
                long last = first + LittleEndian.getUInt(src, o1) - 1;
                o1 += LittleEndian.INT_SIZE;
                while (src[(int) last] == 0 && first <= last)
                    last--;
                final int l = (int) (last - first + 1);
                value = codepage != -1 ?
                    new String(src, first, l,
                               codepageToEncoding(codepage)) :
                    new String(src, first, l);
                break;
            }
            case Variant.VT_LPWSTR:
            {
                /*
                 * Read a Unicode string. In Java it is represented as
                 * a String object. The 0x00 bytes at the end must be
                 * stripped.
                 */
                final int first = o1 + LittleEndian.INT_SIZE;
                long last = first + LittleEndian.getUInt(src, o1) - 1;
                long l = last - first;
                o1 += LittleEndian.INT_SIZE;
                StringBuffer b = new StringBuffer((int) (last - first));
                for (int i = 0; i <= l; i++)
                {
                    final int i1 = o1 + (i * 2);
                    final int i2 = i1 + 1;
                    final int high = src[i2] << 8;
                    final int low = src[i1] & 0x00ff;
                    final char c = (char) (high | low);
                    b.append(c);
                }
                /* Strip 0x00 characters from the end of the string: */
                while (b.length() > 0 && b.charAt(b.length() - 1) == 0x00)
                    b.setLength(b.length() - 1);
                value = b.toString();
                break;
            }
            case Variant.VT_CF:
            {
                if(l1 < 0) {
                    /**
                     *  YK: reading the ClipboardData packet (VT_CF) is not quite correct.
                     *  The size of the data is determined by the first four bytes of the packet
                     *  while the current implementation calculates it in the Section constructor.
                     *  Test files in Bugzilla 42726 and 45583 clearly show that this approach does not always work.
                     *  The workaround below attempts to gracefully handle such cases instead of throwing exceptions.
                     *
                     *  August 20, 2009
                     */
                    l1 = LittleEndian.getInt(src, o1); o1 += LittleEndian.INT_SIZE;
                }
                final byte[] v = new byte[l1];
                System.arraycopy(src, o1, v, 0, v.length);
                value = v;
                break;
            }
            case Variant.VT_BOOL:
            {
                /*
                 * The first four bytes in src, from src[offset] to
                 * src[offset + 3] contain the DWord for VT_BOOL, so
                 * skip it, we don't need it.
                 */
                // final int first = offset + LittleEndian.INT_SIZE;
                long bool = LittleEndian.getUInt(src, o1);
                if (bool != 0)
                    value = Boolean.TRUE;
                else
                    value = Boolean.FALSE;
                break;
            }
            default:
            {
                final byte[] v = new byte[l1];
                for (int i = 0; i < l1; i++)
                    v[i] = src[(o1 + i)];
                throw new ReadingNotSupportedException(type, v);
            }
        }
        return value;
    }


    /**
     * <p>Turns a codepage number into the equivalent character encoding's
     * name.</p>
     *
     * @param codepage The codepage number
     *
     * @return The character encoding's name. If the codepage number is 65001,
     * the encoding name is "UTF-8". All other positive numbers are mapped to
     * "cp" followed by the number, e.g. if the codepage number is 1252 the
     * returned character encoding name will be "cp1252".
     *
     * @exception UnsupportedEncodingException if the specified codepage is
     * less than zero.
     */
    public static String codepageToEncoding(final int codepage)
    throws UnsupportedEncodingException
    {
        if (codepage <= 0)
            throw new UnsupportedEncodingException
                ("Codepage number may not be " + codepage);
        switch (codepage)
        {
            case Constants.CP_UTF16:
                return "UTF-16";
            case Constants.CP_UTF16_BE:
                return "UTF-16BE";
            case Constants.CP_UTF8:
                return "UTF-8";
            case Constants.CP_037:
                return "cp037";
            case Constants.CP_GBK:
                return "GBK";
            case Constants.CP_MS949:
                return "ms949";
            case Constants.CP_WINDOWS_1250:
                return "windows-1250";
            case Constants.CP_WINDOWS_1251:
                return "windows-1251";
            case Constants.CP_WINDOWS_1252:
                return "windows-1252";
            case Constants.CP_WINDOWS_1253:
                return "windows-1253";
            case Constants.CP_WINDOWS_1254:
                return "windows-1254";
            case Constants.CP_WINDOWS_1255:
                return "windows-1255";
            case Constants.CP_WINDOWS_1256:
                return "windows-1256";
            case Constants.CP_WINDOWS_1257:
                return "windows-1257";
            case Constants.CP_WINDOWS_1258:
                return "windows-1258";
            case Constants.CP_JOHAB:
                return "johab";
            case Constants.CP_MAC_ROMAN:
                return "MacRoman";
            case Constants.CP_MAC_JAPAN:
                return "SJIS";
            case Constants.CP_MAC_CHINESE_TRADITIONAL:
                return "Big5";
            case Constants.CP_MAC_KOREAN:
                return "EUC-KR";
            case Constants.CP_MAC_ARABIC:
                return "MacArabic";
            case Constants.CP_MAC_HEBREW:
                return "MacHebrew";
            case Constants.CP_MAC_GREEK:
                return "MacGreek";
            case Constants.CP_MAC_CYRILLIC:
                return "MacCyrillic";
            case Constants.CP_MAC_CHINESE_SIMPLE:
                return "EUC_CN";
            case Constants.CP_MAC_ROMANIA:
                return "MacRomania";
            case Constants.CP_MAC_UKRAINE:
                return "MacUkraine";
            case Constants.CP_MAC_THAI:
                return "MacThai";
            case Constants.CP_MAC_CENTRAL_EUROPE:
                return "MacCentralEurope";
            case Constants.CP_MAC_ICELAND:
                  return "MacIceland";
            case Constants.CP_MAC_TURKISH:
                return "MacTurkish";
            case Constants.CP_MAC_CROATIAN:
                return "MacCroatian";
            case Constants.CP_US_ACSII:
            case Constants.CP_US_ASCII2:
                return "US-ASCII";
            case Constants.CP_KOI8_R:
                return "KOI8-R";
            case Constants.CP_ISO_8859_1:
                return "ISO-8859-1";
            case Constants.CP_ISO_8859_2:
                return "ISO-8859-2";
            case Constants.CP_ISO_8859_3:
                return "ISO-8859-3";
            case Constants.CP_ISO_8859_4:
                return "ISO-8859-4";
            case Constants.CP_ISO_8859_5:
                return "ISO-8859-5";
            case Constants.CP_ISO_8859_6:
                return "ISO-8859-6";
            case Constants.CP_ISO_8859_7:
                return "ISO-8859-7";
            case Constants.CP_ISO_8859_8:
                return "ISO-8859-8";
            case Constants.CP_ISO_8859_9:
                return "ISO-8859-9";
            case Constants.CP_ISO_2022_JP1:
            case Constants.CP_ISO_2022_JP2:
            case Constants.CP_ISO_2022_JP3:
                return "ISO-2022-JP";
            case Constants.CP_ISO_2022_KR:
                return "ISO-2022-KR";
            case Constants.CP_EUC_JP:
                return "EUC-JP";
            case Constants.CP_EUC_KR:
                return "EUC-KR";
            case Constants.CP_GB2312:
                return "GB2312";
            case Constants.CP_GB18030:
                return "GB18030";
            case Constants.CP_SJIS:
                return "SJIS";
            default:
                return "cp" + codepage;
        }
    }


    /**
     * <p>Writes a variant value to an output stream. This method ensures that
     * always a multiple of 4 bytes is written.</p>
     *
     * <p>If the codepage is UTF-16, which is encouraged, strings
     * <strong>must</strong> always be written as {@link Variant#VT_LPWSTR}
     * strings, not as {@link Variant#VT_LPSTR} strings. This method ensure this
     * by converting strings appropriately, if needed.</p>
     *
     * @param out The stream to write the value to.
     * @param type The variant's type.
     * @param value The variant's value.
     * @param codepage The codepage to use to write non-wide strings
     * @return The number of entities that have been written. In many cases an
     * "entity" is a byte but this is not always the case.
     * @exception IOException if an I/O exceptions occurs
     * @exception WritingNotSupportedException if a property is to be written
     * who's variant type HPSF does not yet support
     */
    public static int write(final OutputStream out, final long type,
                            final Object value, final int codepage)
        throws IOException, WritingNotSupportedException
    {
        int length = 0;
        switch ((int) type)
        {
            case Variant.VT_BOOL:
            {
                int trueOrFalse;
                if (((Boolean) value).booleanValue())
                    trueOrFalse = 1;
                else
                    trueOrFalse = 0;
                length = TypeWriter.writeUIntToStream(out, trueOrFalse);
                break;
            }
            case Variant.VT_LPSTR:
            {
                final byte[] bytes =
                    (codepage == -1 ?
                    ((String) value).getBytes() :
                    ((String) value).getBytes(codepageToEncoding(codepage)));
                length = TypeWriter.writeUIntToStream(out, bytes.length + 1);
                final byte[] b = new byte[bytes.length + 1];
                System.arraycopy(bytes, 0, b, 0, bytes.length);
                b[b.length - 1] = 0x00;
                out.write(b);
                length += b.length;
                break;
            }
            case Variant.VT_LPWSTR:
            {
                final int nrOfChars = ((String) value).length() + 1;
                length += TypeWriter.writeUIntToStream(out, nrOfChars);
                char[] s = Util.pad4((String) value);
                for (int i = 0; i < s.length; i++)
                {
                    final int high = ((s[i] & 0x0000ff00) >> 8);
                    final int low = (s[i] & 0x000000ff);
                    final byte highb = (byte) high;
                    final byte lowb = (byte) low;
                    out.write(lowb);
                    out.write(highb);
                    length += 2;
                }
                out.write(0x00);
                out.write(0x00);
                length += 2;
                break;
            }
            case Variant.VT_CF:
            {
                final byte[] b = (byte[]) value;
                out.write(b);
                length = b.length;
                break;
            }
            case Variant.VT_EMPTY:
            {
                TypeWriter.writeUIntToStream(out, Variant.VT_EMPTY);
                length = LittleEndianConsts.INT_SIZE;
                break;
            }
            case Variant.VT_I2:
            {
                TypeWriter.writeToStream(out, ((Integer) value).shortValue());
                length = LittleEndianConsts.SHORT_SIZE;
                break;
            }
            case Variant.VT_I4:
            {
                if (!(value instanceof Integer))
                {
                    throw new ClassCastException("Could not cast an object to "
                            + Integer.class.toString() + ": "
                            + value.getClass().toString() + ", "
                            + value.toString());
                }
                length += TypeWriter.writeToStream(out,
                          ((Integer) value).intValue());
                break;
            }
            case Variant.VT_I8:
            {
                TypeWriter.writeToStream(out, ((Long) value).longValue());
                length = LittleEndianConsts.LONG_SIZE;
                break;
            }
            case Variant.VT_R8:
            {
                length += TypeWriter.writeToStream(out,
                          ((Double) value).doubleValue());
                break;
            }
            case Variant.VT_FILETIME:
            {
                long filetime = Util.dateToFileTime((Date) value);
                int high = (int) ((filetime >> 32) & 0x00000000FFFFFFFFL);
                int low = (int) (filetime & 0x00000000FFFFFFFFL);
                length += TypeWriter.writeUIntToStream
                    (out, 0x0000000FFFFFFFFL & low);
                length += TypeWriter.writeUIntToStream
                    (out, 0x0000000FFFFFFFFL & high);
                break;
            }
            default:
            {
                /* The variant type is not supported yet. However, if the value
                 * is a byte array we can write it nevertheless. */
                if (value instanceof byte[])
                {
                    final byte[] b = (byte[]) value;
                    out.write(b);
                    length = b.length;
                    writeUnsupportedTypeMessage
                        (new WritingNotSupportedException(type, value));
                }
                else
                    throw new WritingNotSupportedException(type, value);
                break;
            }
        }

        return length;
    }

}