1049 lines
37 KiB
Java
1049 lines
37 KiB
Java
/* ====================================================================
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
this work for additional information regarding copyright ownership.
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
(the "License"); you may not use this file except in compliance with
|
|
the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==================================================================== */
|
|
package org.apache.poi.extractor;
|
|
|
|
import static org.apache.poi.POITestCase.assertContains;
|
|
import static org.junit.Assert.assertEquals;
|
|
import static org.junit.Assert.assertFalse;
|
|
import static org.junit.Assert.assertNotNull;
|
|
import static org.junit.Assert.assertNull;
|
|
import static org.junit.Assert.assertTrue;
|
|
import static org.junit.Assert.fail;
|
|
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
|
|
import org.apache.poi.POIDataSamples;
|
|
import org.apache.poi.POIOLE2TextExtractor;
|
|
import org.apache.poi.POITextExtractor;
|
|
import org.apache.poi.POIXMLException;
|
|
import org.apache.poi.POIXMLTextExtractor;
|
|
import org.apache.poi.UnsupportedFileFormatException;
|
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
|
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
|
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
|
import org.apache.poi.hssf.HSSFTestDataSamples;
|
|
import org.apache.poi.hssf.OldExcelFormatException;
|
|
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
|
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
|
import org.apache.poi.openxml4j.opc.PackageAccess;
|
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|
import org.apache.poi.util.IOUtils;
|
|
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
|
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
|
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
|
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
|
import org.junit.BeforeClass;
|
|
import org.junit.Test;
|
|
|
|
/**
|
|
* Test that the extractor factory plays nicely
|
|
*/
|
|
public class TestExtractorFactory {
|
|
private static File txt;
|
|
|
|
private static File xls;
|
|
private static File xlsx;
|
|
private static File xlsxStrict;
|
|
private static File xltx;
|
|
private static File xlsEmb;
|
|
private static File xlsb;
|
|
|
|
private static File doc;
|
|
private static File doc6;
|
|
private static File doc95;
|
|
private static File docx;
|
|
private static File dotx;
|
|
private static File docEmb;
|
|
private static File docEmbOOXML;
|
|
|
|
private static File ppt;
|
|
private static File pptx;
|
|
|
|
private static File msg;
|
|
private static File msgEmb;
|
|
private static File msgEmbMsg;
|
|
|
|
private static File vsd;
|
|
private static File vsdx;
|
|
|
|
private static File pub;
|
|
|
|
private static File getFileAndCheck(POIDataSamples samples, String name) {
|
|
File file = samples.getFile(name);
|
|
|
|
assertNotNull("Did not get a file for " + name, file);
|
|
assertTrue("Did not get a type file for " + name, file.isFile());
|
|
assertTrue("File did not exist: " + name, file.exists());
|
|
|
|
return file;
|
|
}
|
|
|
|
@BeforeClass
|
|
public static void setUp() throws Exception {
|
|
|
|
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
|
|
xls = getFileAndCheck(ssTests, "SampleSS.xls");
|
|
xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
|
|
xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
|
|
xltx = getFileAndCheck(ssTests, "test.xltx");
|
|
xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
|
|
xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
|
|
|
|
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
|
|
doc = getFileAndCheck(wpTests, "SampleDoc.doc");
|
|
doc6 = getFileAndCheck(wpTests, "Word6.doc");
|
|
doc95 = getFileAndCheck(wpTests, "Word95.doc");
|
|
docx = getFileAndCheck(wpTests, "SampleDoc.docx");
|
|
dotx = getFileAndCheck(wpTests, "test.dotx");
|
|
docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
|
|
docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
|
|
|
|
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
|
ppt = getFileAndCheck(slTests, "SampleShow.ppt");
|
|
pptx = getFileAndCheck(slTests, "SampleShow.pptx");
|
|
txt = getFileAndCheck(slTests, "SampleShow.txt");
|
|
|
|
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
|
|
vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
|
|
vsdx = getFileAndCheck(dgTests, "test.vsdx");
|
|
|
|
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
|
pub = getFileAndCheck(pubTests, "Simple.pub");
|
|
|
|
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
|
|
msg = getFileAndCheck(olTests, "quick.msg");
|
|
msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
|
|
msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
|
|
}
|
|
|
|
@Test
|
|
public void testFile() throws Exception {
|
|
// Excel
|
|
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
|
|
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
|
|
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
|
|
xlsExtractor
|
|
instanceof ExcelExtractor
|
|
);
|
|
assertTrue(
|
|
xlsExtractor.getText().length() > 200
|
|
);
|
|
xlsExtractor.close();
|
|
|
|
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof XSSFExcelExtractor
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(xlsx);
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(xltx);
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof XSSFExcelExtractor
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(xlsb);
|
|
assertTrue(
|
|
extractor.getText().contains("test")
|
|
);
|
|
extractor.close();
|
|
|
|
|
|
extractor = ExtractorFactory.createExtractor(xltx);
|
|
assertTrue(
|
|
extractor.getText().contains("test")
|
|
);
|
|
extractor.close();
|
|
|
|
// TODO Support OOXML-Strict, see bug #57699
|
|
try {
|
|
/*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
|
|
fail("OOXML-Strict isn't yet supported");
|
|
} catch (POIXMLException e) {
|
|
// Expected, for now
|
|
}
|
|
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
|
// assertTrue(
|
|
// extractor
|
|
// instanceof XSSFExcelExtractor
|
|
// );
|
|
// extractor.close();
|
|
//
|
|
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
|
|
// assertTrue(
|
|
// extractor.getText().contains("test")
|
|
// );
|
|
// extractor.close();
|
|
|
|
|
|
// Word
|
|
extractor = ExtractorFactory.createExtractor(doc);
|
|
assertTrue(
|
|
extractor
|
|
instanceof WordExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(doc6);
|
|
assertTrue(
|
|
extractor
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 20
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(doc95);
|
|
assertTrue(
|
|
extractor
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(docx);
|
|
assertTrue(
|
|
extractor instanceof XWPFWordExtractor
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(docx);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(dotx);
|
|
assertTrue(
|
|
extractor instanceof XWPFWordExtractor
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(dotx);
|
|
assertTrue(
|
|
extractor.getText().contains("Test")
|
|
);
|
|
extractor.close();
|
|
|
|
// PowerPoint (PPT)
|
|
extractor = ExtractorFactory.createExtractor(ppt);
|
|
assertTrue(
|
|
extractor
|
|
instanceof PowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
// PowerPoint (PPTX)
|
|
extractor = ExtractorFactory.createExtractor(pptx);
|
|
assertTrue(
|
|
extractor
|
|
instanceof XSLFPowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
// Visio - binary
|
|
extractor = ExtractorFactory.createExtractor(vsd);
|
|
assertTrue(
|
|
extractor
|
|
instanceof VisioTextExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Visio - vsdx
|
|
extractor = ExtractorFactory.createExtractor(vsdx);
|
|
assertTrue(
|
|
extractor
|
|
instanceof XDGFVisioExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 20
|
|
);
|
|
extractor.close();
|
|
|
|
// Publisher
|
|
extractor = ExtractorFactory.createExtractor(pub);
|
|
assertTrue(
|
|
extractor
|
|
instanceof PublisherTextExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Outlook msg
|
|
extractor = ExtractorFactory.createExtractor(msg);
|
|
assertTrue(
|
|
extractor
|
|
instanceof OutlookTextExtactor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Text
|
|
try {
|
|
ExtractorFactory.createExtractor(txt);
|
|
fail();
|
|
} catch(IllegalArgumentException e) {
|
|
// Good
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testInputStream() throws Exception {
|
|
// Excel
|
|
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
|
|
assertTrue(
|
|
extractor
|
|
instanceof ExcelExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof XSSFExcelExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
// TODO Support OOXML-Strict, see bug #57699
|
|
// assertTrue(
|
|
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
|
|
// instanceof XSSFExcelExtractor
|
|
// );
|
|
// assertTrue(
|
|
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
|
|
// );
|
|
extractor.close();
|
|
|
|
// Word
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof WordExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 20
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
|
assertTrue(
|
|
extractor.getClass().getName(),
|
|
extractor
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
|
|
assertTrue(
|
|
extractor
|
|
instanceof XWPFWordExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
// PowerPoint
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
|
|
assertTrue(
|
|
extractor
|
|
instanceof PowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
|
|
assertTrue(
|
|
extractor
|
|
instanceof XSLFPowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 120
|
|
);
|
|
extractor.close();
|
|
|
|
// Visio
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
|
|
assertTrue(
|
|
extractor
|
|
instanceof VisioTextExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Visio - vsdx
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
|
|
assertTrue(
|
|
extractor
|
|
instanceof XDGFVisioExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 20
|
|
);
|
|
extractor.close();
|
|
|
|
// Publisher
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
|
|
assertTrue(
|
|
extractor
|
|
instanceof PublisherTextExtractor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Outlook msg
|
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
|
|
assertTrue(
|
|
extractor
|
|
instanceof OutlookTextExtactor
|
|
);
|
|
assertTrue(
|
|
extractor.getText().length() > 50
|
|
);
|
|
extractor.close();
|
|
|
|
// Text
|
|
try {
|
|
FileInputStream stream = new FileInputStream(txt);
|
|
try {
|
|
ExtractorFactory.createExtractor(stream);
|
|
fail();
|
|
} finally {
|
|
IOUtils.closeQuietly(stream);
|
|
}
|
|
} catch(IllegalArgumentException e) {
|
|
// Good
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testPOIFS() throws Exception {
|
|
// Excel
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
|
|
instanceof ExcelExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
|
);
|
|
|
|
// Word
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
|
|
instanceof WordExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
|
);
|
|
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
|
);
|
|
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
|
);
|
|
|
|
// PowerPoint
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
|
|
instanceof PowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
|
);
|
|
|
|
// Visio
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
|
|
instanceof VisioTextExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
|
);
|
|
|
|
// Publisher
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
|
|
instanceof PublisherTextExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
|
);
|
|
|
|
// Outlook msg
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
|
|
instanceof OutlookTextExtactor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
|
);
|
|
|
|
// Text
|
|
try {
|
|
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
|
|
fail();
|
|
} catch(IOException e) {
|
|
// Good
|
|
}
|
|
}
|
|
|
|
|
|
@Test
|
|
public void testOPOIFS() throws Exception {
|
|
// Excel
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
|
|
instanceof ExcelExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
|
|
);
|
|
|
|
// Word
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
|
|
instanceof WordExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
|
|
);
|
|
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
|
|
);
|
|
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
|
|
instanceof Word6Extractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
|
|
);
|
|
|
|
// PowerPoint
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
|
|
instanceof PowerPointExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
|
|
);
|
|
|
|
// Visio
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
|
|
instanceof VisioTextExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
|
|
);
|
|
|
|
// Publisher
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
|
|
instanceof PublisherTextExtractor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
|
|
);
|
|
|
|
// Outlook msg
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
|
|
instanceof OutlookTextExtactor
|
|
);
|
|
assertTrue(
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
|
|
);
|
|
|
|
// Text
|
|
try {
|
|
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
|
|
fail();
|
|
} catch(IOException e) {
|
|
// Good
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testPackage() throws Exception {
|
|
// Excel
|
|
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
|
assertTrue(extractor instanceof XSSFExcelExtractor);
|
|
extractor.close();
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
|
|
assertTrue(extractor.getText().length() > 200);
|
|
extractor.close();
|
|
|
|
// Word
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
|
assertTrue(extractor instanceof XWPFWordExtractor);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
|
|
assertTrue(extractor.getText().length() > 120);
|
|
extractor.close();
|
|
|
|
// PowerPoint
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
|
assertTrue(extractor instanceof XSLFPowerPointExtractor);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
|
|
assertTrue(extractor.getText().length() > 120);
|
|
extractor.close();
|
|
|
|
// Visio
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
|
|
assertTrue(extractor instanceof XDGFVisioExtractor);
|
|
assertTrue(extractor.getText().length() > 20);
|
|
extractor.close();
|
|
|
|
// Text
|
|
try {
|
|
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
|
|
fail("TestExtractorFactory.testPackage() failed on " + txt);
|
|
} catch(UnsupportedFileFormatException e) {
|
|
// Good
|
|
} catch (Exception e) {
|
|
System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testPreferEventBased() throws Exception {
|
|
assertFalse(ExtractorFactory.getPreferEventExtractor());
|
|
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
|
|
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
|
|
|
ExtractorFactory.setThreadPrefersEventExtractors(true);
|
|
|
|
assertTrue(ExtractorFactory.getPreferEventExtractor());
|
|
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
|
|
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
|
|
|
ExtractorFactory.setAllThreadsPreferEventExtractors(false);
|
|
|
|
assertFalse(ExtractorFactory.getPreferEventExtractor());
|
|
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
|
|
assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
|
|
|
|
ExtractorFactory.setAllThreadsPreferEventExtractors(null);
|
|
|
|
assertTrue(ExtractorFactory.getPreferEventExtractor());
|
|
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
|
|
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
|
|
|
|
|
// Check we get the right extractors now
|
|
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
|
assertTrue(
|
|
extractor
|
|
instanceof EventBasedExcelExtractor
|
|
);
|
|
extractor.close();
|
|
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
|
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
|
|
|
|
// Put back to normal
|
|
ExtractorFactory.setThreadPrefersEventExtractors(false);
|
|
assertFalse(ExtractorFactory.getPreferEventExtractor());
|
|
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
|
|
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
|
|
|
// And back
|
|
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
|
assertTrue(
|
|
extractor
|
|
instanceof ExcelExtractor
|
|
);
|
|
extractor.close();
|
|
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
|
assertTrue(
|
|
extractor
|
|
instanceof XSSFExcelExtractor
|
|
);
|
|
extractor.close();
|
|
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
|
|
assertTrue(
|
|
extractor.getText().length() > 200
|
|
);
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Test embeded docs text extraction. For now, only
|
|
* does poifs embeded, but will do ooxml ones
|
|
* at some point.
|
|
*/
|
|
@Test
|
|
public void testEmbeded() throws Exception {
|
|
POIOLE2TextExtractor ext;
|
|
POITextExtractor[] embeds;
|
|
|
|
// No embedings
|
|
ext = (POIOLE2TextExtractor)
|
|
ExtractorFactory.createExtractor(xls);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
assertEquals(0, embeds.length);
|
|
ext.close();
|
|
|
|
// Excel
|
|
ext = (POIOLE2TextExtractor)
|
|
ExtractorFactory.createExtractor(xlsEmb);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
|
|
assertEquals(6, embeds.length);
|
|
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
|
|
for (POITextExtractor embed : embeds) {
|
|
assertTrue(embed.getText().length() > 20);
|
|
|
|
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
else if (embed instanceof ExcelExtractor) numXls++;
|
|
else if (embed instanceof WordExtractor) numWord++;
|
|
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
}
|
|
assertEquals(2, numPpt);
|
|
assertEquals(2, numXls);
|
|
assertEquals(2, numWord);
|
|
assertEquals(0, numMsg);
|
|
ext.close();
|
|
|
|
// Word
|
|
ext = (POIOLE2TextExtractor)
|
|
ExtractorFactory.createExtractor(docEmb);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
|
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
assertEquals(4, embeds.length);
|
|
for (POITextExtractor embed : embeds) {
|
|
assertTrue(embed.getText().length() > 20);
|
|
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
else if (embed instanceof ExcelExtractor) numXls++;
|
|
else if (embed instanceof WordExtractor) numWord++;
|
|
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
}
|
|
assertEquals(1, numPpt);
|
|
assertEquals(2, numXls);
|
|
assertEquals(1, numWord);
|
|
assertEquals(0, numMsg);
|
|
ext.close();
|
|
|
|
// Word which contains an OOXML file
|
|
ext = (POIOLE2TextExtractor)
|
|
ExtractorFactory.createExtractor(docEmbOOXML);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
|
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
|
|
assertEquals(3, embeds.length);
|
|
for (POITextExtractor embed : embeds) {
|
|
assertTrue(embed.getText().length() > 20);
|
|
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
else if (embed instanceof ExcelExtractor) numXls++;
|
|
else if (embed instanceof WordExtractor) numWord++;
|
|
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
else if (embed instanceof XWPFWordExtractor) numWordX++;
|
|
}
|
|
assertEquals(1, numPpt);
|
|
assertEquals(1, numXls);
|
|
assertEquals(0, numWord);
|
|
assertEquals(1, numWordX);
|
|
assertEquals(0, numMsg);
|
|
ext.close();
|
|
|
|
// Outlook
|
|
ext = (OutlookTextExtactor)
|
|
ExtractorFactory.createExtractor(msgEmb);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
|
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
assertEquals(1, embeds.length);
|
|
for (POITextExtractor embed : embeds) {
|
|
assertTrue(embed.getText().length() > 20);
|
|
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
else if (embed instanceof ExcelExtractor) numXls++;
|
|
else if (embed instanceof WordExtractor) numWord++;
|
|
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
}
|
|
assertEquals(0, numPpt);
|
|
assertEquals(0, numXls);
|
|
assertEquals(1, numWord);
|
|
assertEquals(0, numMsg);
|
|
ext.close();
|
|
|
|
// Outlook with another outlook file in it
|
|
ext = (OutlookTextExtactor)
|
|
ExtractorFactory.createExtractor(msgEmbMsg);
|
|
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
|
|
|
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
|
|
assertEquals(1, embeds.length);
|
|
for (POITextExtractor embed : embeds) {
|
|
assertTrue(embed.getText().length() > 20);
|
|
if (embed instanceof PowerPointExtractor) numPpt++;
|
|
else if (embed instanceof ExcelExtractor) numXls++;
|
|
else if (embed instanceof WordExtractor) numWord++;
|
|
else if (embed instanceof OutlookTextExtactor) numMsg++;
|
|
}
|
|
assertEquals(0, numPpt);
|
|
assertEquals(0, numXls);
|
|
assertEquals(0, numWord);
|
|
assertEquals(1, numMsg);
|
|
ext.close();
|
|
|
|
// TODO - PowerPoint
|
|
// TODO - Publisher
|
|
// TODO - Visio
|
|
}
|
|
|
|
private static final String[] EXPECTED_FAILURES = new String[] {
|
|
// password protected files
|
|
"spreadsheet/password.xls",
|
|
"spreadsheet/protected_passtika.xlsx",
|
|
"spreadsheet/51832.xls",
|
|
"document/PasswordProtected.doc",
|
|
"slideshow/Password_Protected-hello.ppt",
|
|
"slideshow/Password_Protected-56-hello.ppt",
|
|
"slideshow/Password_Protected-np-hello.ppt",
|
|
"slideshow/cryptoapi-proc2356.ppt",
|
|
//"document/bug53475-password-is-pass.docx",
|
|
//"document/bug53475-password-is-solrcell.docx",
|
|
"spreadsheet/xor-encryption-abc.xls",
|
|
"spreadsheet/35897-type4.xls",
|
|
//"poifs/protect.xlsx",
|
|
//"poifs/protected_sha512.xlsx",
|
|
//"poifs/extenxls_pwd123.xlsx",
|
|
//"poifs/protected_agile.docx",
|
|
"spreadsheet/58616.xlsx",
|
|
|
|
// TODO: fails XMLExportTest, is this ok?
|
|
"spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
|
|
"spreadsheet/55864.xlsx",
|
|
"spreadsheet/57890.xlsx",
|
|
|
|
// TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
|
|
"spreadsheet/44958.xls",
|
|
"spreadsheet/44958_1.xls",
|
|
"spreadsheet/testArraysAndTables.xls",
|
|
|
|
// TODO: good to ignore?
|
|
"spreadsheet/sample-beta.xlsx",
|
|
|
|
// This is actually a spreadsheet!
|
|
"hpsf/TestRobert_Flaherty.doc",
|
|
|
|
// some files that are broken, eg Word 95, ...
|
|
"spreadsheet/43493.xls",
|
|
"spreadsheet/46904.xls",
|
|
"document/Bug50955.doc",
|
|
"slideshow/PPT95.ppt",
|
|
"openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
|
|
"openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
|
|
"openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
|
|
"openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
|
|
"openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
|
|
"openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
|
|
"openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
|
|
"openxml4j/invalid.xlsx",
|
|
"spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
|
|
"spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
|
|
"spreadsheet/Simple.xlsb",
|
|
"poifs/unknown_properties.msg", // POIFS properties corrupted
|
|
"poifs/only-zero-byte-streams.ole2", // No actual contents
|
|
"spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
|
|
"spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
|
|
|
|
// old Excel files, which we only support simple text extraction of
|
|
"spreadsheet/testEXCEL_2.xls",
|
|
"spreadsheet/testEXCEL_3.xls",
|
|
"spreadsheet/testEXCEL_4.xls",
|
|
"spreadsheet/testEXCEL_5.xls",
|
|
"spreadsheet/testEXCEL_95.xls",
|
|
|
|
// OOXML Strict is not yet supported, see bug #57699
|
|
"spreadsheet/SampleSS.strict.xlsx",
|
|
"spreadsheet/SimpleStrict.xlsx",
|
|
"spreadsheet/sample.strict.xlsx",
|
|
|
|
// non-TNEF files
|
|
"ddf/Container.dat",
|
|
"ddf/47143.dat",
|
|
|
|
// sheet cloning errors
|
|
"spreadsheet/47813.xlsx",
|
|
"spreadsheet/56450.xls",
|
|
"spreadsheet/57231_MixedGasReport.xls",
|
|
"spreadsheet/OddStyleRecord.xls",
|
|
"spreadsheet/WithChartSheet.xlsx",
|
|
"spreadsheet/chart_sheet.xlsx",
|
|
};
|
|
|
|
@Test
|
|
public void testFileLeak() throws Exception {
|
|
// run a number of files that might fail in order to catch
|
|
// leaked file resources when using file-leak-detector while
|
|
// running the test
|
|
|
|
for(String file : EXPECTED_FAILURES) {
|
|
try {
|
|
ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
|
|
} catch (Exception e) {
|
|
// catch all exceptions here as we are only interested in file-handle leaks
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* #59074 - Excel 95 files should give a helpful message, not just
|
|
* "No supported documents found in the OLE2 stream"
|
|
*/
|
|
@Test
|
|
public void bug59074() throws Exception {
|
|
try {
|
|
ExtractorFactory.createExtractor(
|
|
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
|
|
fail("Old excel formats not supported via ExtractorFactory");
|
|
} catch (OldExcelFormatException e) {
|
|
// expected here
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testGetEmbeddedFromXMLExtractor() {
|
|
try {
|
|
// currently not implemented
|
|
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
|
|
fail("Unsupported currently");
|
|
} catch (IllegalStateException e) {
|
|
// expected here
|
|
}
|
|
}
|
|
|
|
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
|
|
// When this happens, change this from @Test(expected=...) to @Test
|
|
// bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
|
|
@Test(expected=AssertionError.class)
|
|
public void test45565() throws Exception {
|
|
POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
|
|
try {
|
|
String text = extractor.getText();
|
|
assertContains(text, "testdoc");
|
|
assertContains(text, "test phrase");
|
|
} finally {
|
|
extractor.close();
|
|
}
|
|
}
|
|
}
|