diff --git a/src/java/org/apache/poi/poifs/macros/VBAMacroReader.java b/src/java/org/apache/poi/poifs/macros/VBAMacroReader.java index bad012e29..921f749df 100644 --- a/src/java/org/apache/poi/poifs/macros/VBAMacroReader.java +++ b/src/java/org/apache/poi/poifs/macros/VBAMacroReader.java @@ -43,7 +43,9 @@ import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.HexDump; import org.apache.poi.util.IOUtils; +import org.apache.poi.util.LittleEndian; import org.apache.poi.util.RLEDecompressingInputStream; +import org.apache.poi.util.StringUtil; /** *

Finds all VBA Macros in an office file (OLE2/POIFS and OOXML/OPC), @@ -61,9 +63,7 @@ import org.apache.poi.util.RLEDecompressingInputStream; public class VBAMacroReader implements Closeable { protected static final String VBA_PROJECT_OOXML = "vbaProject.bin"; protected static final String VBA_PROJECT_POIFS = "VBA"; - // FIXME: When minimum supported version is Java 7, replace with java.nio.charset.StandardCharsets.UTF_16LE - private static final Charset UTF_16LE = Charset.forName("UTF-16LE"); - + private NPOIFSFileSystem fs; public VBAMacroReader(InputStream rstream) throws IOException { @@ -145,7 +145,7 @@ public class VBAMacroReader implements Closeable { } } protected static class ModuleMap extends HashMap { - Charset charset = Charset.forName("Cp1252"); // default charset + Charset charset = StringUtil.WIN_1252; // default charset } /** @@ -172,20 +172,7 @@ public class VBAMacroReader implements Closeable { } } - /** - * Read length bytes of MBCS (multi-byte character set) characters from the stream - * - * @param stream the inputstream to read from - * @param length number of bytes to read from stream - * @param charset the character set encoding of the bytes in the stream - * @return a java String in the supplied character set - * @throws IOException If reading from the stream fails - */ - private static String readString(InputStream stream, int length, Charset charset) throws IOException { - byte[] buffer = new byte[length]; - int count = stream.read(buffer); - return new String(buffer, 0, count, charset); - } + /** * reads module from DIR node in input stream and adds it to the modules map for decompression later @@ -199,7 +186,7 @@ public class VBAMacroReader implements Closeable { * @param modules a map to store the modules * @throws IOException If reading data from the stream or from modules fails */ - private static void readModule(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException { + private static void readModuleMetadataFromDirStream(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException { int moduleOffset = in.readInt(); Module module = modules.get(streamName); if (module == null) { @@ -218,27 +205,57 @@ public class VBAMacroReader implements Closeable { } } - private static void readModule(DocumentInputStream dis, String name, ModuleMap modules) throws IOException { + private static void readModuleFromDocumentStream(DocumentNode documentNode, String name, ModuleMap modules) throws IOException { Module module = modules.get(name); // TODO Refactor this to fetch dir then do the rest if (module == null) { // no DIR stream with offsets yet, so store the compressed bytes for later module = new Module(); modules.put(name, module); - module.read(dis); + InputStream dis = new DocumentInputStream(documentNode); + try { + module.read(dis); + } finally { + dis.close(); + } } else if (module.buf == null) { //if we haven't already read the bytes for the module keyed off this name... + if (module.offset == null) { //This should not happen. bug 59858 throw new IOException("Module offset for '" + name + "' was never read."); } - // we know the offset already, so decompress immediately on-the-fly - long skippedBytes = dis.skip(module.offset); - if (skippedBytes != module.offset) { - throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes"); + + //try the general case, where module.offset is accurate + InputStream decompressed = null; + InputStream compressed = new DocumentInputStream(documentNode); + try { + // we know the offset already, so decompress immediately on-the-fly + long skippedBytes = compressed.skip(module.offset); + if (skippedBytes != module.offset) { + throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes"); + } + decompressed = new RLEDecompressingInputStream(compressed); + module.read(decompressed); + return; + } catch (IllegalArgumentException e) { + } catch (IllegalStateException e) { + } finally { + IOUtils.closeQuietly(compressed); + IOUtils.closeQuietly(decompressed); + } + + //bad module.offset, try brute force + compressed = new DocumentInputStream(documentNode); + byte[] decompressedBytes = null; + try { + decompressedBytes = findCompressedStreamWBruteForce(compressed); + } finally { + IOUtils.closeQuietly(compressed); + } + + if (decompressedBytes != null) { + module.read(new ByteArrayInputStream(decompressedBytes)); } - InputStream stream = new RLEDecompressingInputStream(dis); - module.read(stream); - stream.close(); } } @@ -249,7 +266,7 @@ public class VBAMacroReader implements Closeable { * @throws IOException If skipping would exceed the available data or skipping did not work. */ private static void trySkip(InputStream in, long n) throws IOException { - long skippedBytes = in.skip(n); + long skippedBytes = IOUtils.skipFully(in, n); if (skippedBytes != n) { if (skippedBytes < 0) { throw new IOException( @@ -258,33 +275,18 @@ public class VBAMacroReader implements Closeable { } else { throw new IOException( "Tried skipping " + n + " bytes, but only " + skippedBytes + " bytes were skipped. " - + "This should never happen."); + + "This should never happen with a non-corrupt file."); } } } // Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx - private static final int EOF = -1; - private static final int VERSION_INDEPENDENT_TERMINATOR = 0x0010; - @SuppressWarnings("unused") - private static final int VERSION_DEPENDENT_TERMINATOR = 0x002B; - private static final int PROJECTVERSION = 0x0009; - private static final int PROJECTCODEPAGE = 0x0003; - private static final int STREAMNAME = 0x001A; - private static final int MODULEOFFSET = 0x0031; - @SuppressWarnings("unused") - private static final int MODULETYPE_PROCEDURAL = 0x0021; - @SuppressWarnings("unused") - private static final int MODULETYPE_DOCUMENT_CLASS_OR_DESIGNER = 0x0022; - @SuppressWarnings("unused") - private static final int PROJECTLCID = 0x0002; - @SuppressWarnings("unused") - private static final int MODULE_NAME = 0x0019; - @SuppressWarnings("unused") - private static final int MODULE_NAME_UNICODE = 0x0047; - @SuppressWarnings("unused") - private static final int MODULE_DOC_STRING = 0x001c; private static final int STREAMNAME_RESERVED = 0x0032; + private static final int PROJECT_CONSTANTS_RESERVED = 0x003C; + private static final int HELP_FILE_PATH_RESERVED = 0x003D; + private static final int REFERENCE_NAME_RESERVED = 0x003E; + private static final int DOC_STRING_RESERVED = 0x0040; + private static final int MODULE_DOCSTRING_RESERVED = 0x0048; /** * Reads VBA Project modules from a VBA Project directory located at @@ -293,76 +295,330 @@ public class VBAMacroReader implements Closeable { * @since 3.15-beta2 */ protected void readMacros(DirectoryNode macroDir, ModuleMap modules) throws IOException { + //bug59858 shows that dirstream may not be in this directory (\MBD00082648\_VBA_PROJECT_CUR\VBA ENTRY NAME) + //but may be in another directory (\_VBA_PROJECT_CUR\VBA ENTRY NAME) + //process the dirstream first -- "dir" is case insensitive + for (String entryName : macroDir.getEntryNames()) { + if ("dir".equalsIgnoreCase(entryName)) { + processDirStream(macroDir.getEntry(entryName), modules); + break; + } + } + for (Entry entry : macroDir) { if (! (entry instanceof DocumentNode)) { continue; } String name = entry.getName(); DocumentNode document = (DocumentNode)entry; - DocumentInputStream dis = new DocumentInputStream(document); - try { - if ("dir".equalsIgnoreCase(name)) { - // process DIR - RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis); - String streamName = null; - int recordId = 0; - try { - while (true) { - recordId = in.readShort(); - if (EOF == recordId - || VERSION_INDEPENDENT_TERMINATOR == recordId) { - break; - } - int recordLength = in.readInt(); - switch (recordId) { - case PROJECTVERSION: - trySkip(in, 6); - break; - case PROJECTCODEPAGE: - int codepage = in.readShort(); - modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true)); - break; - case STREAMNAME: - streamName = readString(in, recordLength, modules.charset); - int reserved = in.readShort(); - if (reserved != STREAMNAME_RESERVED) { - throw new IOException("Expected x0032 after stream name before Unicode stream name, but found: "+ - Integer.toHexString(reserved)); - } - int unicodeNameRecordLength = in.readInt(); - readUnicodeString(in, unicodeNameRecordLength); - // do something with this at some point - break; - case MODULEOFFSET: - readModule(in, streamName, modules); - break; - default: - trySkip(in, recordLength); - break; - } - } - } catch (final IOException e) { - throw new IOException( - "Error occurred while reading macros at section id " - + recordId + " (" + HexDump.shortToHex(recordId) + ")", e); - } - finally { - in.close(); - } - } else if (!startsWithIgnoreCase(name, "__SRP") + + if (! "dir".equalsIgnoreCase(name) && !startsWithIgnoreCase(name, "__SRP") && !startsWithIgnoreCase(name, "_VBA_PROJECT")) { // process module, skip __SRP and _VBA_PROJECT since these do not contain macros - readModule(dis, name, modules); - } - } - finally { - dis.close(); + readModuleFromDocumentStream(document, name, modules); } } } + private enum RecordType { + // Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx + MODULE_OFFSET(0x0031), + PROJECT_SYS_KIND(0x01), + PROJECT_LCID(0x0002), + PROJECT_LCID_INVOKE(0x14), + PROJECT_CODEPAGE(0x0003), + PROJECT_NAME(0x04), + PROJECT_DOC_STRING(0x05), + PROJECT_HELP_FILE_PATH(0x06), + PROJECT_HELP_CONTEXT(0x07, 8), + PROJECT_LIB_FLAGS(0x08), + PROJECT_VERSION(0x09, 10), + PROJECT_CONSTANTS(0x0C), + PROJECT_MODULES(0x0F), + DIR_STREAM_TERMINATOR(0x10), + PROJECT_COOKIE(0x13), + MODULE_NAME(0x19), + MODULE_NAME_UNICODE(0x47), + MODULE_STREAM_NAME(0x1A), + MODULE_DOC_STRING(0x1C), + MODULE_HELP_CONTEXT(0x1E), + MODULE_COOKIE(0x2c), + MODULE_TYPE_PROCEDURAL(0x21, 4), + MODULE_TYPE_OTHER(0x22, 4), + MODULE_PRIVATE(0x28, 4), + REFERENCE_NAME(0x16), + REFERENCE_REGISTERED(0x0D), + REFERENCE_PROJECT(0x0E), + REFERENCE_CONTROL_A(0x2F), + + //according to the spec, REFERENCE_CONTROL_B(0x33) should have the + //same structure as REFERENCE_CONTROL_A(0x2F). + //However, it seems to have the int(length) record structure that most others do. + //See 59830.xls for this record. + REFERENCE_CONTROL_B(0x33), + //REFERENCE_ORIGINAL(0x33), + + + MODULE_TERMINATOR(0x002B), + EOF(-1), + UNKNOWN(-2); + + + private final int VARIABLE_LENGTH = -1; + private final int id; + private final int constantLength; + + RecordType(int id) { + this.id = id; + this.constantLength = VARIABLE_LENGTH; + } + + RecordType(int id, int constantLength) { + this.id = id; + this.constantLength = constantLength; + } + + int getConstantLength() { + return constantLength; + } + + static RecordType lookup(int id) { + for (RecordType type : RecordType.values()) { + if (type.id == id) { + return type; + } + } + return UNKNOWN; + } + } + + + private enum DIR_STATE { + INFORMATION_RECORD, + REFERENCES_RECORD, + MODULES_RECORD + } + + private static class ASCIIUnicodeStringPair { + private final String ascii; + private final String unicode; + + ASCIIUnicodeStringPair(String ascii, String unicode) { + this.ascii = ascii; + this.unicode = unicode; + } + + private String getAscii() { + return ascii; + } + + private String getUnicode() { + return unicode; + } + } + + private void processDirStream(Entry dir, ModuleMap modules) throws IOException { + DocumentNode dirDocumentNode = (DocumentNode)dir; + DocumentInputStream dis = new DocumentInputStream(dirDocumentNode); + DIR_STATE dirState = DIR_STATE.INFORMATION_RECORD; + try { + RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis); + String streamName = null; + int recordId = 0; + boolean inReferenceTwiddled = false; + try { + while (true) { + recordId = in.readShort(); + if (recordId == -1) { + break; + } + RecordType type = RecordType.lookup(recordId); + + if (type.equals(RecordType.EOF) || type.equals(RecordType.DIR_STREAM_TERMINATOR)) { + break; + } + switch (type) { + case PROJECT_VERSION: + trySkip(in, RecordType.PROJECT_VERSION.getConstantLength()); + break; + case PROJECT_CODEPAGE: + in.readInt();//record size must == 4 + int codepage = in.readShort(); + modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true)); + break; + case MODULE_STREAM_NAME: + ASCIIUnicodeStringPair pair = readStringPair(in, modules.charset, STREAMNAME_RESERVED); + streamName = pair.getAscii(); + break; + case PROJECT_DOC_STRING: + readStringPair(in, modules.charset, DOC_STRING_RESERVED); + break; + case PROJECT_HELP_FILE_PATH: + readStringPair(in, modules.charset, HELP_FILE_PATH_RESERVED); + break; + case PROJECT_CONSTANTS: + readStringPair(in, modules.charset, PROJECT_CONSTANTS_RESERVED); + break; + case REFERENCE_NAME: + if (dirState.equals(DIR_STATE.INFORMATION_RECORD)) { + dirState = DIR_STATE.REFERENCES_RECORD; + } + readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED); + break; + case MODULE_DOC_STRING : + int modDocStringLength = in.readInt(); + readString(in, modDocStringLength, modules.charset); + int modDocStringReserved = in.readShort(); + if (modDocStringReserved != MODULE_DOCSTRING_RESERVED) { + throw new IOException("Expected x003C after stream name before Unicode stream name, but found: " + + Integer.toHexString(modDocStringReserved)); + } + int unicodeModDocStringLength = in.readInt(); + readUnicodeString(in, unicodeModDocStringLength); + // do something with this at some point + break; + case MODULE_OFFSET: + int modOffsetSz = in.readInt(); + //should be 4 + readModuleMetadataFromDirStream(in, streamName, modules); + break; + case PROJECT_MODULES: + dirState = DIR_STATE.MODULES_RECORD; + in.readInt();//size must == 2 + in.readShort();//number of modules + break; + case REFERENCE_CONTROL_A: + int szTwiddled = in.readInt(); + trySkip(in, szTwiddled); + int nextRecord = in.readShort(); + //reference name is optional! + if (nextRecord == RecordType.REFERENCE_NAME.id) { + readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED); + nextRecord = in.readShort(); + } + if (nextRecord != 0x30) { + throw new IOException("Expected 0x30 as Reserved3 in a ReferenceControl record"); + } + int szExtended = in.readInt(); + trySkip(in, szExtended); + break; + case MODULE_TERMINATOR: + int endOfModulesReserved = in.readInt(); + //must be 0; + break; + default: + if (type.getConstantLength() > -1) { + trySkip(in, type.getConstantLength()); + } else { + int recordLength = in.readInt(); + trySkip(in, recordLength); + } + break; + } + } + } catch (final IOException e) { + throw new IOException( + "Error occurred while reading macros at section id " + + recordId + " (" + HexDump.shortToHex(recordId) + ")", e); + } finally { + in.close(); + } + } finally { + dis.close(); + } + } + + private ASCIIUnicodeStringPair readStringPair(RLEDecompressingInputStream in, Charset charset, int reservedByte) throws IOException { + int nameLength = in.readInt(); + String ascii = readString(in, nameLength, charset); + int reserved = in.readShort(); + if (reserved != reservedByte) { + throw new IOException("Expected "+Integer.toHexString(reservedByte)+ "after name before Unicode name, but found: " + + Integer.toHexString(reserved)); + } + int unicodeNameRecordLength = in.readInt(); + String unicode = readUnicodeString(in, unicodeNameRecordLength); + return new ASCIIUnicodeStringPair(ascii, unicode); + } + + + /** + * Read length bytes of MBCS (multi-byte character set) characters from the stream + * + * @param stream the inputstream to read from + * @param length number of bytes to read from stream + * @param charset the character set encoding of the bytes in the stream + * @return a java String in the supplied character set + * @throws IOException If reading from the stream fails + */ + private static String readString(InputStream stream, int length, Charset charset) throws IOException { + byte[] buffer = IOUtils.safelyAllocate(length, 20000); + int bytesRead = IOUtils.readFully(stream, buffer); + if (bytesRead != length) { + throw new IOException("Tried to read: "+length + + ", but could only read: "+bytesRead); + } + return new String(buffer, 0, length, charset); + } + private String readUnicodeString(RLEDecompressingInputStream in, int unicodeNameRecordLength) throws IOException { - byte[] buffer = new byte[unicodeNameRecordLength]; - IOUtils.readFully(in, buffer); - return new String(buffer, UTF_16LE); + byte[] buffer = IOUtils.safelyAllocate(unicodeNameRecordLength, 20000); + int bytesRead = IOUtils.readFully(in, buffer); + if (bytesRead != unicodeNameRecordLength) { + + } + return new String(buffer, StringUtil.UTF16LE); + } + + /** + * Sometimes the offset record in the dirstream is incorrect, but the macro can still be found. + * This will try to find the the first RLEDecompressing stream that starts with "Attribute". + * This relies on some, er, heuristics, admittedly. + * + * @param is full module inputstream to read + * @return uncompressed bytes if found, null otherwise + * @throws IOException for a true IOException copying the is to a byte array + */ + private static byte[] findCompressedStreamWBruteForce(InputStream is) throws IOException { + //buffer to memory for multiple tries + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + IOUtils.copy(is, bos); + byte[] compressed = bos.toByteArray(); + byte[] decompressed = null; + for (int i = 0; i < compressed.length; i++) { + if (compressed[i] == 0x01 && i < compressed.length-1) { + int w = LittleEndian.getUShort(compressed, i+1); + if (w <= 0 || (w & 0x7000) != 0x3000) { + continue; + } + decompressed = tryToDecompress(new ByteArrayInputStream(compressed, i, compressed.length - i)); + if (decompressed != null) { + if (decompressed.length > 9) { + //this is a complete hack. The challenge is that there + //can be many 0 length or junk streams that are uncompressed + //look in the first 20 characters for "Attribute" + int firstX = Math.min(20, decompressed.length); + String start = new String(decompressed, 0, firstX, StringUtil.WIN_1252); + if (start.contains("Attribute")) { + return decompressed; + } + } + } + } + } + return decompressed; + } + + private static byte[] tryToDecompress(InputStream is) { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try { + IOUtils.copy(new RLEDecompressingInputStream(is), bos); + } catch (IllegalArgumentException e){ + return null; + } catch (IllegalStateException e) { + return null; + } catch (IOException e) { + return null; + } + return bos.toByteArray(); } } diff --git a/src/java/org/apache/poi/util/RLEDecompressingInputStream.java b/src/java/org/apache/poi/util/RLEDecompressingInputStream.java index 471fdab2c..7cc68483c 100644 --- a/src/java/org/apache/poi/util/RLEDecompressingInputStream.java +++ b/src/java/org/apache/poi/util/RLEDecompressingInputStream.java @@ -156,7 +156,7 @@ public class RLEDecompressingInputStream extends InputStream { private int readChunk() throws IOException { pos = 0; int w = readShort(in); - if (w == -1) { + if (w == -1 || w == 0) { return -1; } int chunkSize = (w & 0x0FFF) + 1; // plus 3 bytes minus 2 for the length diff --git a/src/testcases/org/apache/poi/poifs/macros/TestVBAMacroReader.java b/src/testcases/org/apache/poi/poifs/macros/TestVBAMacroReader.java index eeb43b4d2..12e2f16f0 100644 --- a/src/testcases/org/apache/poi/poifs/macros/TestVBAMacroReader.java +++ b/src/testcases/org/apache/poi/poifs/macros/TestVBAMacroReader.java @@ -33,6 +33,7 @@ import java.util.HashMap; import java.util.Map; import static org.apache.poi.POITestCase.assertContains; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -251,6 +252,7 @@ public class TestVBAMacroReader { File f = POIDataSamples.getSpreadSheetInstance().getFile("59830.xls"); VBAMacroReader r = new VBAMacroReader(f); Map macros = r.readMacros(); + assertEquals(29, macros.size()); assertNotNull(macros.get("Module20")); assertContains(macros.get("Module20"), "here start of superscripting"); r.close(); @@ -261,6 +263,7 @@ public class TestVBAMacroReader { File f = POIDataSamples.getSpreadSheetInstance().getFile("59858.xls"); VBAMacroReader r = new VBAMacroReader(f); Map macros = r.readMacros(); + assertEquals(11, macros.size()); assertNotNull(macros.get("Sheet4")); assertContains(macros.get("Sheet4"), "intentional constituent"); r.close(); @@ -271,6 +274,7 @@ public class TestVBAMacroReader { File f = POIDataSamples.getDocumentInstance().getFile("60158.docm"); VBAMacroReader r = new VBAMacroReader(f); Map macros = r.readMacros(); + assertEquals(2, macros.size()); assertNotNull(macros.get("NewMacros")); assertContains(macros.get("NewMacros"), "' dirty"); r.close(); @@ -282,8 +286,24 @@ public class TestVBAMacroReader { File f = POIDataSamples.getSpreadSheetInstance().getFile("60273.xls"); VBAMacroReader r = new VBAMacroReader(f); Map macros = r.readMacros(); + assertEquals(2, macros.size()); assertNotNull(macros.get("Module1")); assertContains(macros.get("Module1"), "9/8/2004"); r.close(); } + + + @Test + public void bug60279() throws IOException { + File f = POIDataSamples.getDocumentInstance().getFile("60279.doc"); + VBAMacroReader r = new VBAMacroReader(f); + Map macros = r.readMacros(); + assertEquals(1, macros.size()); + String content = macros.get("ThisDocument"); + assertContains(content, "Attribute VB_Base = \"1Normal.ThisDocument\""); + assertContains(content, "Attribute VB_Customizable = True"); + r.close(); + } + + } diff --git a/test-data/document/60279.doc b/test-data/document/60279.doc new file mode 100644 index 000000000..bf8643380 Binary files /dev/null and b/test-data/document/60279.doc differ