Make the extractor exactly that, powered by the reader #52949

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1738429 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2016-04-10 12:59:38 +00:00
parent 2185e91883
commit e8e1a294e6

View File

@ -17,172 +17,73 @@
package org.apache.poi.poifs.macros; package org.apache.poi.poifs.macros;
import java.io.ByteArrayInputStream; import java.io.File;
import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.OutputStreamWriter;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.poi.poifs.eventfilesystem.POIFSReader; import org.apache.poi.util.StringUtil;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.RLEDecompressingInputStream;
/** /**
* This class is able to extract the source of all VBA Modules of an Excel file. * This class extracts out the source of all VBA Modules of an office file,
* both OOXML and OLE2/POIFS, eg XLSM or DOC
*/ */
public class VBAMacroExtractor { public class VBAMacroExtractor {
public static void main(String args[]) throws IOException {
if (args.length == 0) {
System.err.println("Use:");
System.err.println(" VBAMacroExtractor <office.doc> [output]");
System.err.println("");
System.err.println("If an output directory is given, macros are written there");
System.err.println("Otherwise they are output to the screen");
System.exit(1);
}
/** File input = new File(args[0]);
* Extract macros from XLSM or XLS file. Automatically detects ZIP (XLSM, DOCX, etc) files. File output = null;
* if (args.length > 1) {
* @param in output = new File(args[1]);
* @return
* @throws IOException
*/
public Map<String, String> extractMacros(InputStream in) throws IOException {
PushbackInputStream bpin = new PushbackInputStream(in, 2);
byte[] header = new byte[2];
if (bpin.read(header) != 2) {
throw new IllegalArgumentException("Invalid InputStream: cannot read 2 bytes");
}
bpin.unread(header);
if (header[0] == 'P' && header[1] == 'K') {
ZipInputStream zis = new ZipInputStream(bpin);
ZipEntry zipEntry;
while ((zipEntry = zis.getNextEntry()) != null) {
if ("xl/vbaProject.bin".equals(zipEntry.getName())) {
try {
return extractMacrosFromPOIFSInputStream(zis);
} finally {
zis.closeEntry();
}
}
}
return null;
} else {
return extractMacrosFromPOIFSInputStream(bpin);
} }
VBAMacroExtractor extract = new VBAMacroExtractor();
extract.extract(input, output);
} }
/** public void extract(File input, File outputDir) throws IOException {
* Extracts all macros from all modules of the provided input stream. The stream is assumed to be in POIFS format (i.e. XLS file itself or if (! input.exists()) throw new FileNotFoundException(input.toString());
* vbaProject.bin from OOXML files) System.err.print("Extracting VBA Macros from " + input + " to ");
* if (outputDir != null) {
* @param in if (! outputDir.exists()) outputDir.mkdir();
* @return System.err.println(outputDir);
* @throws IOException } else {
*/ System.err.println("STDOUT");
public Map<String, String> extractMacrosFromPOIFSInputStream(InputStream in) throws IOException {
class Module {
Integer offset;
byte[] buf;
} }
class ModuleMap extends HashMap<String, Module> {
Charset charset = Charset.forName("Cp1252"); // default charset VBAMacroReader reader = new VBAMacroReader(input);
} Map<String,String> macros = reader.readMacros();
try { reader.close();
final ModuleMap modules = new ModuleMap();
POIFSReader dirReader = new POIFSReader();
dirReader.registerListener(new POIFSReaderListener() {
public void processPOIFSReaderEvent(POIFSReaderEvent event) { final String divider = "---------------------------------------";
try { for (String macro : macros.keySet()) {
String name = event.getName(); if (outputDir == null) {
if (event.getPath().toString().endsWith("\\VBA")) { System.out.println(divider);
if ("dir".equals(name)) { System.out.println(macro);
// process DIR System.out.println("");
RLEDecompressingInputStream in = new RLEDecompressingInputStream(event.getStream()); System.out.println(macros.get(macro));
String streamName = null; } else {
while (true) { File out = new File(outputDir, macro + ".vba");
int id = in.readShort(); FileOutputStream fout = new FileOutputStream(out);
if (id == -1 || id == 0x0010) { OutputStreamWriter fwriter = new OutputStreamWriter(fout, StringUtil.UTF8);
break; // EOF or TERMINATOR fwriter.write(macros.get(macro));
} fwriter.close();
int len = in.readInt(); fout.close();
switch (id) { System.out.println("Extracted " + out);
case 0x0009: // PROJECTVERSION
in.skip(6);
break;
case 0x0003: // PROJECTCODEPAGE
int codepage = in.readShort();
modules.charset = Charset.forName("Cp" + codepage);
break;
case 0x001A: // STREAMNAME
byte[] streamNameBuf = new byte[len];
int count = in.read(streamNameBuf);
streamName = new String(streamNameBuf, 0, count, modules.charset);
break;
case 0x0031: // MODULEOFFSET
int moduleOffset = in.readInt();
Module module = modules.get(streamName);
if (module != null) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
RLEDecompressingInputStream stream = new RLEDecompressingInputStream(new ByteArrayInputStream(
module.buf, moduleOffset, module.buf.length - moduleOffset));
IOUtils.copy(stream, out);
stream.close();
out.close();
module.buf = out.toByteArray();
} else {
module = new Module();
module.offset = moduleOffset;
modules.put(streamName, module);
}
break;
default:
in.skip(len);
break;
}
}
} else if (!name.startsWith("__SRP") && !name.startsWith("_VBA_PROJECT")) {
// process module, skip __SRP and _VBA_PROJECT since these do not contain macros
Module module = modules.get(name);
final DocumentInputStream stream = event.getStream();
final InputStream in;
if (module == null) {
// no DIR stream with offsets yet, so store the compressed bytes for later
module = new Module();
modules.put(name, module);
in = stream;
} else {
// we know the offset already, so decompress immediately on-the-fly
stream.skip(module.offset);
in = new RLEDecompressingInputStream(stream);
}
final ByteArrayOutputStream out = new ByteArrayOutputStream();
IOUtils.copy(in, out);
in.close();
out.close();
module.buf = out.toByteArray();
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
dirReader.read(in);
Map<String, String> moduleSources = new HashMap<String, String>();
for (Map.Entry<String, Module> entry : modules.entrySet()) {
Module module = entry.getValue();
if (module.buf != null && module.buf.length > 0) { // Skip empty modules
moduleSources.put(entry.getKey(), new String(module.buf, modules.charset));
}
} }
return moduleSources; }
} catch (IOException e) { if (outputDir == null) {
e.printStackTrace(); System.out.println(divider);
throw e;
} }
} }
} }