VBA extraction support from bug #52949 from Barry Lagerweij

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1738418 13f79535-47bb-0310-9956-ffa450edef68
2016-04-10 11:16:49 +00:00 · 2016-04-10 11:16:49 +00:00 · 4abcc6626a
commit 4abcc6626a
parent 2a0ed81538
2 changed files with 461 additions and 0 deletions
--- a/src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java
+++ b/src/java/org/apache/poi/poifs/macros/VBAMacroExtractor.java
@ -0,0 +1,188 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.poifs.macros;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.RLEDecompressingInputStream;
+
+/**
+ * Extracts the source code of the VBA modules stored in an Office document.
+ * <p>
+ * Both container flavours are handled: OLE2 (POIFS) files such as XLS, and ZIP based OOXML
+ * packages such as XLSM, which carry the VBA project as an embedded POIFS file named
+ * <code>vbaProject.bin</code>.
+ */
+public class VBAMacroExtractor {
+
+    /** Suffix of the ZIP entry holding the VBA project (e.g. xl/vbaProject.bin, word/vbaProject.bin). */
+    private static final String VBA_PROJECT_ENTRY_SUFFIX = "/vbaProject.bin";
+
+    /** Name of the storage whose streams (dir stream and module streams) are inspected. */
+    private static final String VBA_STORAGE_NAME = "VBA";
+
+    /** A single module while it is being pieced together from the dir stream and its own stream. */
+    private static final class Module {
+
+        /** Offset of the compressed source inside the module stream; null until MODULEOFFSET was read. */
+        Integer offset;
+
+        /** Raw stream bytes while the offset is still unknown, the decompressed source bytes afterwards. */
+        byte[] buf;
+    }
+
+    /** Everything collected while the POIFS reader walks over one VBA project. */
+    private static final class Project {
+
+        final Map<String, Module> modules = new HashMap<String, Module>();
+
+        Charset charset = Charset.forName("Cp1252"); // default charset
+    }
+
+    /**
+     * Extracts macros from an OLE2 file or from a ZIP based OOXML package. A stream starting with
+     * the ZIP signature "PK" is scanned for a <code>vbaProject.bin</code> entry; anything else is
+     * handed to {@link #extractMacrosFromPOIFSInputStream(InputStream)} unchanged.
+     * <p>
+     * The provided stream is not closed by this method.
+     *
+     * @param in the document to read
+     * @return the module sources keyed by module stream name, or <code>null</code> if the stream is
+     *         a ZIP package without a <code>vbaProject.bin</code> entry
+     * @throws IOException if reading the stream fails
+     * @throws IllegalArgumentException if fewer than two bytes can be read from the stream
+     */
+    public Map<String, String> extractMacros(InputStream in) throws IOException {
+        PushbackInputStream bpin = new PushbackInputStream(in, 2);
+        byte[] header = new byte[2];
+        // A single read() may deliver fewer bytes than requested, so keep reading until
+        // both signature bytes have arrived or the stream has ended.
+        int filled = 0;
+        while (filled < header.length) {
+            int count = bpin.read(header, filled, header.length - filled);
+            if (count < 0) {
+                break;
+            }
+            filled += count;
+        }
+        if (filled != header.length) {
+            throw new IllegalArgumentException("Invalid InputStream: cannot read 2 bytes");
+        }
+        bpin.unread(header);
+        if (header[0] != 'P' || header[1] != 'K') {
+            return extractMacrosFromPOIFSInputStream(bpin);
+        }
+        ZipInputStream zis = new ZipInputStream(bpin);
+        ZipEntry zipEntry;
+        while ((zipEntry = zis.getNextEntry()) != null) {
+            // The directory prefix depends on the application (xl/, word/, ppt/), so only
+            // the entry's file name is compared.
+            if (zipEntry.getName().endsWith(VBA_PROJECT_ENTRY_SUFFIX)) {
+                try {
+                    return extractMacrosFromPOIFSInputStream(zis);
+                } finally {
+                    zis.closeEntry();
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Extracts all macros from all modules of the provided input stream. The stream is assumed to be in POIFS format (i.e. XLS file itself or
+     * vbaProject.bin from OOXML files)
+     * <p>
+     * The dir stream and the module streams may be delivered in any order: a module stream seen
+     * before its offset is known is buffered and decompressed once the dir stream supplies the
+     * offset.
+     *
+     * @param in the POIFS data to read
+     * @return the non-empty module sources keyed by module stream name, never <code>null</code>
+     * @throws IOException if reading the stream fails
+     * @throws RuntimeException wrapping an IOException raised while a single stream is processed
+     */
+    public Map<String, String> extractMacrosFromPOIFSInputStream(InputStream in) throws IOException {
+        final Project project = new Project();
+        POIFSReader reader = new POIFSReader();
+        reader.registerListener(new POIFSReaderListener() {
+
+            public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+                try {
+                    String name = event.getName();
+                    if (!isInVbaStorage(event)) {
+                        return;
+                    }
+                    if ("dir".equals(name)) {
+                        readDirStream(project, event.getStream());
+                    } else if (!name.startsWith("__SRP") && !name.startsWith("_VBA_PROJECT")) {
+                        // skip __SRP and _VBA_PROJECT since these do not contain macros
+                        readModuleStream(project, name, event.getStream());
+                    }
+                } catch (IOException e) {
+                    // the listener interface does not permit checked exceptions
+                    throw new RuntimeException(e);
+                }
+            }
+        });
+        reader.read(in);
+
+        Map<String, String> moduleSources = new HashMap<String, String>();
+        for (Map.Entry<String, Module> entry : project.modules.entrySet()) {
+            byte[] source = entry.getValue().buf;
+            if (source != null && source.length > 0) { // Skip empty modules
+                moduleSources.put(entry.getKey(), new String(source, project.charset));
+            }
+        }
+        return moduleSources;
+    }
+
+    /**
+     * Tells whether the stream of the given event sits directly inside a storage named "VBA".
+     * The path components are compared instead of the path's string form, because the string
+     * form is built with the platform specific file separator.
+     */
+    private static boolean isInVbaStorage(POIFSReaderEvent event) {
+        int depth = event.getPath().length();
+        return depth > 0 && VBA_STORAGE_NAME.equals(event.getPath().getComponent(depth - 1));
+    }
+
+    /**
+     * Walks over the records of the compressed dir stream, picking up the project code page and
+     * the source offset of every module. Modules whose stream was buffered earlier are
+     * decompressed as soon as their offset is read.
+     */
+    private static void readDirStream(Project project, InputStream compressed) throws IOException {
+        RLEDecompressingInputStream dir = new RLEDecompressingInputStream(compressed);
+        try {
+            String streamName = null;
+            while (true) {
+                int id = dir.readShort();
+                if (id == -1 || id == 0x0010) {
+                    return; // EOF or TERMINATOR
+                }
+                int len = dir.readInt();
+                switch (id) {
+                    case 0x0009: // PROJECTVERSION
+                        dir.skip(6);
+                        break;
+                    case 0x0003: // PROJECTCODEPAGE
+                        int codepage = dir.readShort();
+                        project.charset = Charset.forName("Cp" + codepage);
+                        break;
+                    case 0x001A: // STREAMNAME
+                        byte[] streamNameBuf = new byte[len];
+                        int count = dir.read(streamNameBuf);
+                        streamName = new String(streamNameBuf, 0, count, project.charset);
+                        break;
+                    case 0x0031: // MODULEOFFSET
+                        int moduleOffset = dir.readInt();
+                        Module module = project.modules.get(streamName);
+                        if (module == null) {
+                            // module stream not seen yet, remember where its source starts
+                            module = new Module();
+                            module.offset = moduleOffset;
+                            project.modules.put(streamName, module);
+                        } else {
+                            // module stream was buffered earlier, decompress it now
+                            ByteArrayOutputStream out = new ByteArrayOutputStream();
+                            RLEDecompressingInputStream stream = new RLEDecompressingInputStream(new ByteArrayInputStream(
+                                    module.buf, moduleOffset, module.buf.length - moduleOffset));
+                            IOUtils.copy(stream, out);
+                            stream.close();
+                            out.close();
+                            module.buf = out.toByteArray();
+                        }
+                        break;
+                    default:
+                        dir.skip(len);
+                        break;
+                }
+            }
+        } finally {
+            dir.close();
+        }
+    }
+
+    /**
+     * Stores the content of one module stream: decompressed right away if the dir stream has
+     * already supplied the offset, as raw bytes for later decompression otherwise.
+     */
+    private static void readModuleStream(Project project, String name, DocumentInputStream stream) throws IOException {
+        Module module = project.modules.get(name);
+        final InputStream content;
+        if (module == null) {
+            // no DIR stream with offsets yet, so store the compressed bytes for later
+            module = new Module();
+            project.modules.put(name, module);
+            content = stream;
+        } else {
+            // we know the offset already, so decompress immediately on-the-fly
+            stream.skip(module.offset);
+            content = new RLEDecompressingInputStream(stream);
+        }
+        final ByteArrayOutputStream out = new ByteArrayOutputStream();
+        IOUtils.copy(content, out);
+        content.close();
+        out.close();
+        module.buf = out.toByteArray();
+    }
+}
--- a/src/java/org/apache/poi/util/RLEDecompressingInputStream.java
+++ b/src/java/org/apache/poi/util/RLEDecompressingInputStream.java
@ -0,0 +1,273 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Wrapper of InputStream which provides Run Length Encoding (RLE) 
+ *  decompression on the fly. Uses MS-OVBA decompression algorithm. See
+ * http://download.microsoft.com/download/2/4/8/24862317-78F0-4C4B-B355-C7B2C1D997DB/[MS-OVBA].pdf
+ */
+public class RLEDecompressingInputStream extends InputStream {
+
+    /**
+     * Bitmasks for performance
+     */
+    private static final int[] POWER2 = new int[] { 0x0001, // 0
+            0x0002, // 1
+            0x0004, // 2
+            0x0008, // 3
+            0x0010, // 4
+            0x0020, // 5
+            0x0040, // 6
+            0x0080, // 7
+            0x0100, // 8
+            0x0200, // 9
+            0x0400, // 10
+            0x0800, // 11
+            0x1000, // 12
+            0x2000, // 13
+            0x4000, // 14
+            0x8000 // 15
+    };
+
+    /** the wrapped inputstream */
+    private InputStream in;
+
+    /** a byte buffer with size 4096 for storing a single chunk */
+    private byte[] buf;
+
+    /** the current position in the byte buffer for reading */
+    private int pos;
+
+    /** the number of bytes in the byte buffer */
+    private int len;
+
+    /**
+     * Creates a new wrapper RLE Decompression InputStream.
+     * 
+     * @param in
+     * @throws IOException
+     */
+    public RLEDecompressingInputStream(InputStream in) throws IOException {
+        this.in = in;
+        buf = new byte[4096];
+        pos = 0;
+        int header = in.read();
+        if (header != 0x01) {
+            throw new IllegalArgumentException(String.format("Header byte 0x01 expected, received 0x%02X", header & 0xFF));
+        }
+        len = readChunk();
+    }
+
+    @Override
+    public int read() throws IOException {
+        if (len == -1) {
+            return -1;
+        }
+        if (pos >= len) {
+            if ((len = readChunk()) == -1) {
+                return -1;
+            }
+        }
+        return buf[pos++];
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException {
+        return read(b, 0, b.length);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int l) throws IOException {
+        if (len == -1) {
+            return -1;
+        }
+        int offset = off;
+        int length = l;
+        while (length > 0) {
+            if (pos >= len) {
+                if ((len = readChunk()) == -1) {
+                    return offset > off ? offset - off : -1;
+                }
+            }
+            int c = Math.min(length, len - pos);
+            System.arraycopy(buf, pos, b, offset, c);
+            pos += c;
+            length -= c;
+            offset += c;
+        }
+        return l;
+    }
+
+    @Override
+    public long skip(long n) throws IOException {
+        long length = n;
+        while (length > 0) {
+            if (pos >= len) {
+                if ((len = readChunk()) == -1) {
+                    return -1;
+                }
+            }
+            int c = (int) Math.min(n, len - pos);
+            pos += c;
+            length -= c;
+        }
+        return n;
+    }
+
+    @Override
+    public int available() {
+        return (len > 0 ? len - pos : 0);
+    }
+
+    @Override
+    public void close() throws IOException {
+        in.close();
+    }
+
+    /**
+     * Reads a single chunk from the underlying inputstream.
+     * 
+     * @return
+     * @throws IOException
+     */
+    private int readChunk() throws IOException {
+        pos = 0;
+        int w = readShort(in);
+        if (w == -1) {
+            return -1;
+        }
+        int chunkSize = (w & 0x0FFF) + 1; // plus 3 bytes minus 2 for the length
+        if ((w & 0x7000) != 0x3000) {
+            throw new IllegalArgumentException(String.format("Chunksize header A should be 0x3000, received 0x%04X", w & 0xE000));
+        }
+        boolean rawChunk = (w & 0x8000) == 0;
+        if (rawChunk) {
+            if (in.read(buf, 0, chunkSize) < chunkSize) {
+                throw new IllegalStateException(String.format("Not enough bytes read, expected %d", chunkSize));
+            }
+            return chunkSize;
+        } else {
+            int inOffset = 0;
+            int outOffset = 0;
+            while (inOffset < chunkSize) {
+                int tokenFlags = in.read();
+                inOffset++;
+                if (tokenFlags == -1) {
+                    break;
+                }
+                for (int n = 0; n < 8; n++) {
+                    if (inOffset >= chunkSize) {
+                        break;
+                    }
+                    if ((tokenFlags & POWER2[n]) == 0) {
+                        // literal
+                        final int b = in.read();
+                        if (b == -1) {
+                            return -1;
+                        }
+                        buf[outOffset++] = (byte) b;
+                        inOffset++;
+                    } else {
+                        // compressed token
+                        int token = readShort(in);
+                        if (token == -1) {
+                            return -1;
+                        }
+                        inOffset += 2;
+                        int copyLenBits = getCopyLenBits(outOffset - 1);
+                        int copyOffset = (token >> (copyLenBits)) + 1;
+                        int copyLen = (token & (POWER2[copyLenBits] - 1)) + 3;
+                        int startPos = outOffset - copyOffset;
+                        int endPos = startPos + copyLen;
+                        for (int i = startPos; i < endPos; i++) {
+                            buf[outOffset++] = buf[i];
+                        }
+                    }
+                }
+            }
+            return outOffset;
+        }
+    }
+
+    /**
+     * Helper method to determine how many bits in the CopyToken are used for the CopyLength.
+     * 
+     * @param offset
+     * @return
+     */
+    static int getCopyLenBits(int offset) {
+        for (int n = 11; n >= 4; n--) {
+            if ((offset & POWER2[n]) != 0) {
+                return 15 - n;
+            }
+        }
+        return 12;
+    }
+
+    /**
+     * Convenience method for read a 2-bytes short in little endian encoding.
+     * 
+     * @return
+     * @throws IOException
+     */
+    public int readShort() throws IOException {
+        return readShort(this);
+    }
+
+    /**
+     * Convenience method for read a 4-bytes int in little endian encoding.
+     * 
+     * @return
+     * @throws IOException
+     */
+    public int readInt() throws IOException {
+        return readInt(this);
+    }
+
+    private int readShort(InputStream stream) throws IOException {
+        int b0, b1;
+        if ((b0 = stream.read()) == -1) {
+            return -1;
+        }
+        if ((b1 = stream.read()) == -1) {
+            return -1;
+        }
+        return (b0 & 0xFF) | ((b1 & 0xFF) << 8);
+    }
+
+    private int readInt(InputStream stream) throws IOException {
+        int b0, b1, b2, b3;
+        if ((b0 = stream.read()) == -1) {
+            return -1;
+        }
+        if ((b1 = stream.read()) == -1) {
+            return -1;
+        }
+        if ((b2 = stream.read()) == -1) {
+            return -1;
+        }
+        if ((b3 = stream.read()) == -1) {
+            return -1;
+        }
+        return (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
+    }
+}