VBA extraction support from bug #52949 from Barry Lagerweij

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1738418 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2016-04-10 11:16:49 +00:00
parent 2a0ed81538
commit 4abcc6626a
2 changed files with 461 additions and 0 deletions

View File

@ -0,0 +1,188 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.poifs.macros;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.RLEDecompressingInputStream;
/**
 * Extracts the source code of all VBA modules from an OLE2 (POIFS) document,
 * or from the <code>vbaProject.bin</code> part embedded in an OOXML (ZIP)
 * package.
 */
public class VBAMacroExtractor {
    /** File name of the OLE2 container that holds the VBA project inside an OOXML package. */
    private static final String VBA_PROJECT_OOXML = "vbaProject.bin";

    /** Name of the storage whose streams (dir stream and module streams) are processed. */
    private static final String VBA_STORAGE = "VBA";

    /** State collected for a single module stream. */
    private static class Module {
        /** Offset of the compressed source inside the module stream; set once the dir stream was seen. */
        Integer offset;
        /** Raw stream bytes until the offset is known, decompressed source bytes afterwards. */
        byte[] buf;
    }

    /** Modules keyed by stream name, plus the charset used to decode names and sources. */
    private static class ModuleMap extends HashMap<String, Module> {
        Charset charset = Charset.forName("Cp1252"); // default charset
    }

    /**
     * Extracts macros from an OLE2 file (e.g. XLS) or an OOXML file (e.g. XLSM, DOCM).
     * The container type is detected from the first two bytes: a "PK" signature
     * is treated as a ZIP package, anything else is handed to the POIFS reader.
     *
     * @param in the stream to read the document from; it is not closed by this method
     * @return a map from module name to module source, or <code>null</code> if the
     *         stream is a ZIP package without a <code>vbaProject.bin</code> entry
     * @throws IOException if reading the stream fails
     * @throws IllegalArgumentException if the stream holds fewer than 2 bytes
     */
    public Map<String, String> extractMacros(InputStream in) throws IOException {
        PushbackInputStream pushback = new PushbackInputStream(in, 2);
        byte[] header = new byte[2];
        // a single read() may legally deliver fewer bytes than requested, so keep
        // reading until the signature is complete or the stream ends
        int headerLen = 0;
        while (headerLen < header.length) {
            int count = pushback.read(header, headerLen, header.length - headerLen);
            if (count < 0) {
                throw new IllegalArgumentException("Invalid InputStream: cannot read 2 bytes");
            }
            headerLen += count;
        }
        pushback.unread(header);

        if (header[0] != 'P' || header[1] != 'K') {
            return extractMacrosFromPOIFSInputStream(pushback);
        }

        ZipInputStream zis = new ZipInputStream(pushback);
        ZipEntry zipEntry;
        while ((zipEntry = zis.getNextEntry()) != null) {
            if (isVbaProjectEntry(zipEntry.getName())) {
                try {
                    return extractMacrosFromPOIFSInputStream(zis);
                } finally {
                    zis.closeEntry();
                }
            }
        }
        return null;
    }

    /**
     * Extracts all macros from all modules of the provided input stream. The stream is
     * assumed to be in POIFS format (i.e. an XLS file itself or the vbaProject.bin
     * of an OOXML file).
     *
     * @param in the POIFS data
     * @return a map from module name to module source; modules without content are omitted
     * @throws IOException if the POIFS reader fails to read the stream
     * @throws RuntimeException wrapping an IOException raised while a single stream is processed
     */
    public Map<String, String> extractMacrosFromPOIFSInputStream(InputStream in) throws IOException {
        final ModuleMap modules = new ModuleMap();
        POIFSReader dirReader = new POIFSReader();
        dirReader.registerListener(new POIFSReaderListener() {
            public void processPOIFSReaderEvent(POIFSReaderEvent event) {
                try {
                    if (!isInVbaStorage(event)) {
                        return;
                    }
                    String name = event.getName();
                    if ("dir".equals(name)) {
                        processDirStream(event.getStream(), modules);
                    } else if (!name.startsWith("__SRP") && !name.startsWith("_VBA_PROJECT")) {
                        // skip __SRP and _VBA_PROJECT since these do not contain macros
                        processModuleStream(name, event.getStream(), modules);
                    }
                } catch (IOException e) {
                    // the listener interface does not allow checked exceptions
                    throw new RuntimeException(e);
                }
            }
        });
        dirReader.read(in);

        Map<String, String> moduleSources = new HashMap<String, String>();
        for (Map.Entry<String, Module> entry : modules.entrySet()) {
            Module module = entry.getValue();
            if (module.buf != null && module.buf.length > 0) { // Skip empty modules
                moduleSources.put(entry.getKey(), new String(module.buf, modules.charset));
            }
        }
        return moduleSources;
    }

    /** Tells whether a ZIP entry is the VBA project part, regardless of its folder (xl/, word/, ppt/, ...). */
    private static boolean isVbaProjectEntry(String entryName) {
        return entryName.equals(VBA_PROJECT_OOXML) || entryName.endsWith("/" + VBA_PROJECT_OOXML);
    }

    /**
     * Tells whether the document of the event lives directly inside a storage named "VBA".
     * The last path component is compared instead of the string form of the path, so the
     * result does not depend on the separator character used by the path's toString().
     * NOTE(review): toString() is believed to use the platform file separator, which would
     * make a "\\VBA" suffix test fail outside Windows - confirm against POIFSDocumentPath.
     */
    private static boolean isInVbaStorage(POIFSReaderEvent event) {
        int depth = event.getPath().length();
        return depth > 0 && VBA_STORAGE.equals(event.getPath().getComponent(depth - 1));
    }

    /**
     * Walks the records of the compressed dir stream. Picks up the project code page and,
     * for every module, its stream name and the offset of its compressed source. Modules
     * whose stream was already stored are decompressed right away.
     */
    private static void processDirStream(DocumentInputStream compressed, ModuleMap modules) throws IOException {
        RLEDecompressingInputStream dir = new RLEDecompressingInputStream(compressed);
        try {
            String streamName = null;
            while (true) {
                int id = dir.readShort();
                if (id == -1 || id == 0x0010) {
                    break; // EOF or TERMINATOR
                }
                int len = dir.readInt();
                switch (id) {
                case 0x0009: // PROJECTVERSION
                    dir.skip(6);
                    break;
                case 0x0003: // PROJECTCODEPAGE
                    int codepage = dir.readShort();
                    modules.charset = Charset.forName("Cp" + codepage);
                    break;
                case 0x001A: // STREAMNAME
                    byte[] streamNameBuf = new byte[len];
                    int count = dir.read(streamNameBuf);
                    // read() reports -1 at end of stream; treat that as an empty name
                    streamName = new String(streamNameBuf, 0, Math.max(count, 0), modules.charset);
                    break;
                case 0x0031: // MODULEOFFSET
                    int moduleOffset = dir.readInt();
                    Module module = modules.get(streamName);
                    if (module != null) {
                        // the module stream was seen before the dir stream: decompress the stored bytes
                        module.buf = decompress(new ByteArrayInputStream(
                                module.buf, moduleOffset, module.buf.length - moduleOffset));
                    } else {
                        // remember the offset for when the module stream shows up
                        module = new Module();
                        module.offset = moduleOffset;
                        modules.put(streamName, module);
                    }
                    break;
                default:
                    dir.skip(len);
                    break;
                }
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Handles a module stream: decompresses it immediately if its offset is already known,
     * otherwise keeps the raw bytes until the dir stream supplies the offset.
     */
    private static void processModuleStream(String name, DocumentInputStream stream, ModuleMap modules)
            throws IOException {
        Module module = modules.get(name);
        if (module == null) {
            // no DIR stream with offsets yet, so store the compressed bytes for later
            module = new Module();
            modules.put(name, module);
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            try {
                IOUtils.copy(stream, out);
            } finally {
                stream.close();
            }
            module.buf = out.toByteArray();
        } else {
            // we know the offset already, so decompress immediately on-the-fly
            stream.skip(module.offset);
            module.buf = decompress(stream);
        }
    }

    /** Decompresses everything that is left in the given stream and closes it. */
    private static byte[] decompress(InputStream compressed) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        RLEDecompressingInputStream stream = new RLEDecompressingInputStream(compressed);
        try {
            IOUtils.copy(stream, out);
        } finally {
            stream.close();
        }
        return out.toByteArray();
    }
}

View File

@ -0,0 +1,273 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.util;
import java.io.IOException;
import java.io.InputStream;
/**
 * Wrapper of InputStream which provides Run Length Encoding (RLE)
 * decompression on the fly. Uses MS-OVBA decompression algorithm. See
 * http://download.microsoft.com/download/2/4/8/24862317-78F0-4C4B-B355-C7B2C1D997DB/[MS-OVBA].pdf
 * <p>
 * The compressed data starts with a signature byte (0x01) followed by chunks.
 * Every chunk is decoded completely into an internal buffer, from which the
 * read methods are served.
 */
public class RLEDecompressingInputStream extends InputStream {
    /** Size of the buffer that holds one decompressed chunk. */
    private static final int CHUNK_BUFFER_SIZE = 4096;

    /** the wrapped inputstream */
    private final InputStream in;

    /** a byte buffer with size 4096 for storing a single chunk */
    private final byte[] buf;

    /** the current position in the byte buffer for reading */
    private int pos;

    /** the number of bytes in the byte buffer, or -1 once the end of the wrapped stream was hit */
    private int len;

    /**
     * Creates a new wrapper RLE Decompression InputStream and decodes the first chunk.
     *
     * @param in the stream that delivers the compressed data
     * @throws IOException if reading from the wrapped stream fails
     * @throws IllegalArgumentException if the data does not start with the signature byte 0x01
     */
    public RLEDecompressingInputStream(InputStream in) throws IOException {
        this.in = in;
        buf = new byte[CHUNK_BUFFER_SIZE];
        pos = 0;
        int header = in.read();
        if (header != 0x01) {
            throw new IllegalArgumentException(String.format("Header byte 0x01 expected, received 0x%02X", header & 0xFF));
        }
        len = readChunk();
    }

    /**
     * Reads the next decompressed byte.
     *
     * @return the byte as a value in the range 0..255, or -1 at the end of the stream
     * @throws IOException if reading from the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        if (len == -1) {
            return -1;
        }
        // loop instead of a single check: a chunk may decode to zero bytes
        while (pos >= len) {
            if ((len = readChunk()) == -1) {
                return -1;
            }
        }
        // mask to an unsigned value, otherwise a 0xFF data byte looks like end-of-stream
        return buf[pos++] & 0xFF;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Reads decompressed bytes until <code>l</code> bytes were stored or the stream ended.
     *
     * @param b the destination array
     * @param off the index in <code>b</code> of the first byte to store
     * @param l the number of bytes to read
     * @return the number of bytes stored, or -1 if the stream ended before any byte was stored
     * @throws IOException if reading from the wrapped stream fails
     */
    @Override
    public int read(byte[] b, int off, int l) throws IOException {
        if (len == -1) {
            return -1;
        }
        int offset = off;
        int length = l;
        while (length > 0) {
            if (pos >= len) {
                if ((len = readChunk()) == -1) {
                    return offset > off ? offset - off : -1;
                }
            }
            int c = Math.min(length, len - pos);
            System.arraycopy(buf, pos, b, offset, c);
            pos += c;
            length -= c;
            offset += c;
        }
        return l;
    }

    /**
     * Skips over <code>n</code> decompressed bytes.
     *
     * @param n the number of bytes to skip
     * @return <code>n</code> if all bytes were skipped, or -1 if the end of the
     *         stream was reached first
     * @throws IOException if reading from the wrapped stream fails
     */
    @Override
    public long skip(long n) throws IOException {
        long remaining = n;
        while (remaining > 0) {
            if (pos >= len) {
                if ((len = readChunk()) == -1) {
                    return -1;
                }
            }
            // limit by what is still to be skipped, not by the original request,
            // so a skip that crosses a chunk boundary does not overshoot
            int c = (int) Math.min(remaining, len - pos);
            pos += c;
            remaining -= c;
        }
        return n;
    }

    /**
     * @return the number of bytes left in the currently decoded chunk
     */
    @Override
    public int available() {
        return (len > 0 ? len - pos : 0);
    }

    /** Closes the wrapped stream. */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Reads a single chunk from the underlying inputstream and decodes it into the buffer.
     *
     * @return the number of decoded bytes now in the buffer, or -1 if the wrapped
     *         stream ended at the chunk header or in the middle of a token
     * @throws IOException if reading from the wrapped stream fails
     * @throws IllegalArgumentException if the chunk header does not carry the signature bits
     * @throws IllegalStateException if a raw chunk is shorter than its header announces
     */
    private int readChunk() throws IOException {
        pos = 0;
        int w = readShort(in);
        if (w == -1) {
            return -1;
        }
        int chunkSize = (w & 0x0FFF) + 1; // plus 3 bytes minus 2 for the length
        if ((w & 0x7000) != 0x3000) {
            throw new IllegalArgumentException(String.format("Chunksize header A should be 0x3000, received 0x%04X", w & 0x7000));
        }
        boolean rawChunk = (w & 0x8000) == 0;
        if (rawChunk) {
            // a single read() may return fewer bytes than requested, keep going until the chunk is complete
            int filled = 0;
            while (filled < chunkSize) {
                int count = in.read(buf, filled, chunkSize - filled);
                if (count < 0) {
                    throw new IllegalStateException(String.format("Not enough bytes read, expected %d", chunkSize));
                }
                filled += count;
            }
            return chunkSize;
        }

        int inOffset = 0;
        int outOffset = 0;
        while (inOffset < chunkSize) {
            // one flag byte describes the following (up to) 8 tokens, lowest bit first
            int tokenFlags = in.read();
            inOffset++;
            if (tokenFlags == -1) {
                break;
            }
            for (int n = 0; n < 8; n++) {
                if (inOffset >= chunkSize) {
                    break;
                }
                if ((tokenFlags & (1 << n)) == 0) {
                    // literal
                    final int b = in.read();
                    if (b == -1) {
                        return -1;
                    }
                    buf[outOffset++] = (byte) b;
                    inOffset++;
                } else {
                    // compressed token: the high bits hold the distance, the low bits the length
                    int token = readShort(in);
                    if (token == -1) {
                        return -1;
                    }
                    inOffset += 2;
                    int copyLenBits = getCopyLenBits(outOffset - 1);
                    int copyOffset = (token >> copyLenBits) + 1;
                    int copyLen = (token & ((1 << copyLenBits) - 1)) + 3;
                    int startPos = outOffset - copyOffset;
                    int endPos = startPos + copyLen;
                    // copy byte by byte: source and destination may overlap on purpose
                    for (int i = startPos; i < endPos; i++) {
                        buf[outOffset++] = buf[i];
                    }
                }
            }
        }
        return outOffset;
    }

    /**
     * Helper method to determine how many bits in the CopyToken are used for the CopyLength.
     *
     * @param offset the number of bytes decoded so far in the current chunk, minus one
     * @return the number of length bits, between 4 and 12
     */
    static int getCopyLenBits(int offset) {
        for (int n = 11; n >= 4; n--) {
            if ((offset & (1 << n)) != 0) {
                return 15 - n;
            }
        }
        return 12;
    }

    /**
     * Convenience method for read a 2-bytes short in little endian encoding.
     *
     * @return the value in the range 0..65535, or -1 if the stream ended
     * @throws IOException if reading from the wrapped stream fails
     */
    public int readShort() throws IOException {
        return readShort(this);
    }

    /**
     * Convenience method for read a 4-bytes int in little endian encoding.
     *
     * @return the value, or -1 if the stream ended (which cannot be told apart
     *         from the stored value 0xFFFFFFFF)
     * @throws IOException if reading from the wrapped stream fails
     */
    public int readInt() throws IOException {
        return readInt(this);
    }

    /** Reads a little endian 2-byte value from the given stream, -1 on end of stream. */
    private int readShort(InputStream stream) throws IOException {
        int b0, b1;
        if ((b0 = stream.read()) == -1) {
            return -1;
        }
        if ((b1 = stream.read()) == -1) {
            return -1;
        }
        return (b0 & 0xFF) | ((b1 & 0xFF) << 8);
    }

    /** Reads a little endian 4-byte value from the given stream, -1 on end of stream. */
    private int readInt(InputStream stream) throws IOException {
        int b0, b1, b2, b3;
        if ((b0 = stream.read()) == -1) {
            return -1;
        }
        if ((b1 = stream.read()) == -1) {
            return -1;
        }
        if ((b2 = stream.read()) == -1) {
            return -1;
        }
        if ((b3 = stream.read()) == -1) {
            return -1;
        }
        return (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
    }
}