458 lines
17 KiB
Java
458 lines
17 KiB
Java
/* ====================================================================
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
this work for additional information regarding copyright ownership.
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
(the "License"); you may not use this file except in compliance with
|
|
the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==================================================================== */
|
|
|
|
package org.apache.poi.hslf.extractor;
|
|
|
|
import static org.apache.poi.POITestCase.assertContains;
|
|
import static org.apache.poi.POITestCase.assertContainsIgnoreCase;
|
|
import static org.apache.poi.POITestCase.assertNotContained;
|
|
import static org.junit.Assert.assertEquals;
|
|
import static org.junit.Assert.assertFalse;
|
|
import static org.junit.Assert.assertNotNull;
|
|
import static org.junit.Assert.assertTrue;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.util.List;
|
|
|
|
import org.apache.poi.POIDataSamples;
|
|
import org.apache.poi.hslf.model.OLEShape;
|
|
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
|
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
|
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|
import org.apache.poi.util.IOUtils;
|
|
import org.junit.Test;
|
|
|
|
/**
|
|
* Tests that the extractor correctly gets the text out of our sample file
|
|
*/
|
|
public final class TestExtractor {
|
|
/**
|
|
* Extractor primed on the 2 page basic test data
|
|
*/
|
|
private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";
|
|
|
|
/**
|
|
* Extractor primed on the 1 page but text-box'd test data
|
|
*/
|
|
private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
|
|
|
|
/**
|
|
* Where our embeded files live
|
|
*/
|
|
private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
|
|
|
// @Before
|
|
// public void setUp() throws Exception {
|
|
// ppe = new PowerPointExtractor(slTests.getFile("basic_test_ppt_file.ppt").getCanonicalPath());
|
|
// ppe2 = new PowerPointExtractor(slTests.getFile("with_textbox.ppt").getCanonicalPath());
|
|
// }
|
|
|
|
// @After
|
|
// public void closeResources() throws Exception {
|
|
// ppe2.close();
|
|
// ppe.close();
|
|
// }
|
|
|
|
private PowerPointExtractor openExtractor(String fileName) throws IOException {
|
|
InputStream is = slTests.openResourceAsStream(fileName);
|
|
try {
|
|
return new PowerPointExtractor(is);
|
|
} finally {
|
|
is.close();
|
|
}
|
|
}
|
|
|
|
@Test
|
|
public void testReadSheetText() throws IOException {
|
|
// Basic 2 page example
|
|
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
|
ensureTwoStringsTheSame(expectText, ppe.getText());
|
|
ppe.close();
|
|
|
|
// 1 page example with text boxes
|
|
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
|
ensureTwoStringsTheSame(expectText2, ppe2.getText());
|
|
ppe2.close();
|
|
}
|
|
|
|
@Test
|
|
public void testReadNoteText() throws IOException {
|
|
// Basic 2 page example
|
|
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
|
String notesText = ppe.getNotes();
|
|
String expText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
|
|
ensureTwoStringsTheSame(expText, notesText);
|
|
ppe.close();
|
|
|
|
// Other one doesn't have notes
|
|
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
|
|
notesText = ppe2.getNotes();
|
|
expText = "";
|
|
ensureTwoStringsTheSame(expText, notesText);
|
|
ppe2.close();
|
|
}
|
|
|
|
@Test
|
|
public void testReadBoth() throws IOException {
|
|
String[] slText = new String[]{
|
|
"This is a test title\nThis is a test subtitle\nThis is on page 1\n",
|
|
"This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
|
|
};
|
|
String[] ntText = new String[]{
|
|
"These are the notes for page 1\n",
|
|
"These are the notes on page two, again lacking formatting\n"
|
|
};
|
|
|
|
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
|
|
ppe.setSlidesByDefault(true);
|
|
ppe.setNotesByDefault(false);
|
|
assertEquals(slText[0] + slText[1], ppe.getText());
|
|
|
|
ppe.setSlidesByDefault(false);
|
|
ppe.setNotesByDefault(true);
|
|
assertEquals(ntText[0] + ntText[1], ppe.getText());
|
|
|
|
ppe.setSlidesByDefault(true);
|
|
ppe.setNotesByDefault(true);
|
|
assertEquals(slText[0] + slText[1] + "\n" + ntText[0] + ntText[1], ppe.getText());
|
|
ppe.close();
|
|
}
|
|
|
|
/**
|
|
* Test that when presented with a PPT file missing the odd
|
|
* core record, we can still get the rest of the text out
|
|
*
|
|
* @throws Exception
|
|
*/
|
|
@Test
|
|
public void testMissingCoreRecords() throws IOException {
|
|
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
|
|
|
|
String text = ppe.getText(true, false);
|
|
String nText = ppe.getNotes();
|
|
|
|
assertNotNull(text);
|
|
assertNotNull(nText);
|
|
|
|
// Notes record were corrupt, so don't expect any
|
|
assertEquals(nText.length(), 0);
|
|
|
|
// Slide records were fine
|
|
assertContains(text, "Using Disease Surveillance and Response");
|
|
|
|
ppe.close();
|
|
}
|
|
|
|
private void ensureTwoStringsTheSame(String exp, String act) {
|
|
assertEquals(exp.length(), act.length());
|
|
char[] expC = exp.toCharArray();
|
|
char[] actC = act.toCharArray();
|
|
for (int i = 0; i < expC.length; i++) {
|
|
assertEquals("Char " + i, expC[i], actC[i]);
|
|
}
|
|
assertEquals(exp, act);
|
|
}
|
|
|
|
@Test
|
|
public void testExtractFromEmbeded() throws IOException {
|
|
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
|
|
POIFSFileSystem fs = new POIFSFileSystem(is);
|
|
DirectoryNode root = fs.getRoot();
|
|
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
|
|
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
|
|
ppe2.close();
|
|
ppe1.close();
|
|
fs.close();
|
|
}
|
|
|
|
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
|
|
throws IOException {
|
|
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
|
|
assertTrue(dir.hasEntry("PowerPoint Document"));
|
|
|
|
// Check the first file
|
|
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
|
|
PowerPointExtractor ppe = new PowerPointExtractor(ppt);
|
|
assertEquals(expected, ppe.getText(true, false));
|
|
return ppe;
|
|
}
|
|
|
|
/**
|
|
* A powerpoint file with embeded powerpoint files
|
|
*/
|
|
@Test
|
|
public void testExtractFromOwnEmbeded() throws IOException {
|
|
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
|
|
List<OLEShape> shapes = ppe.getOLEShapes();
|
|
assertEquals("Expected 6 ole shapes", 6, shapes.size());
|
|
int num_ppt = 0, num_doc = 0, num_xls = 0;
|
|
for (OLEShape ole : shapes) {
|
|
String name = ole.getInstanceName();
|
|
InputStream data = ole.getObjectData().getData();
|
|
if ("Worksheet".equals(name)) {
|
|
HSSFWorkbook wb = new HSSFWorkbook(data);
|
|
num_xls++;
|
|
wb.close();
|
|
} else if ("Document".equals(name)) {
|
|
HWPFDocument doc = new HWPFDocument(data);
|
|
num_doc++;
|
|
doc.close();
|
|
} else if ("Presentation".equals(name)) {
|
|
num_ppt++;
|
|
HSLFSlideShow ppt = new HSLFSlideShow(data);
|
|
ppt.close();
|
|
}
|
|
data.close();
|
|
}
|
|
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
|
|
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
|
|
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
|
|
ppe.close();
|
|
}
|
|
|
|
/**
|
|
* A powerpoint file with embeded powerpoint files
|
|
*/
|
|
@Test
|
|
public void test52991() throws IOException {
|
|
PowerPointExtractor ppe = openExtractor("badzip.ppt");
|
|
for (OLEShape shape : ppe.getOLEShapes()) {
|
|
IOUtils.copy(shape.getObjectData().getData(), new ByteArrayOutputStream());
|
|
}
|
|
ppe.close();
|
|
}
|
|
|
|
/**
|
|
* From bug #45543
|
|
*/
|
|
@Test
|
|
public void testWithComments() throws IOException {
|
|
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
|
|
String text = ppe1.getText();
|
|
assertFalse("Comments not in by default", text.contains("This is a test comment"));
|
|
|
|
ppe1.setCommentsByDefault(true);
|
|
|
|
text = ppe1.getText();
|
|
assertContains(text, "This is a test comment");
|
|
ppe1.close();
|
|
|
|
|
|
// And another file
|
|
PowerPointExtractor ppe2 = openExtractor("45543.ppt");
|
|
text = ppe2.getText();
|
|
assertFalse("Comments not in by default", text.contains("testdoc"));
|
|
|
|
ppe2.setCommentsByDefault(true);
|
|
|
|
text = ppe2.getText();
|
|
assertContains(text, "testdoc");
|
|
ppe2.close();
|
|
}
|
|
|
|
/**
|
|
* From bug #45537
|
|
*/
|
|
@Test
|
|
public void testHeaderFooter() throws IOException {
|
|
String text;
|
|
|
|
// With a header on the notes
|
|
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
|
|
HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
|
|
is1.close();
|
|
assertNotNull(ppt1.getNotesHeadersFooters());
|
|
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
|
|
|
|
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
|
|
|
|
text = ppe1.getText();
|
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
|
|
|
ppe1.setNotesByDefault(true);
|
|
text = ppe1.getText();
|
|
assertContains(text, "testdoc");
|
|
assertContains(text, "test phrase");
|
|
ppe1.close();
|
|
ppt1.close();
|
|
|
|
// And with a footer, also on notes
|
|
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
|
|
HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
|
|
is2.close();
|
|
|
|
assertNotNull(ppt2.getNotesHeadersFooters());
|
|
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
|
|
ppt2.close();
|
|
|
|
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
|
|
|
|
text = ppe2.getText();
|
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
|
|
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
|
|
|
|
ppe2.setNotesByDefault(true);
|
|
text = ppe2.getText();
|
|
assertContains(text, "testdoc");
|
|
assertContains(text, "test phrase");
|
|
ppe2.close();
|
|
}
|
|
|
|
@SuppressWarnings("unused")
|
|
@Test
|
|
public void testSlideMasterText() throws IOException {
|
|
String masterTitleText = "This is the Master Title";
|
|
String masterRandomText = "This text comes from the Master Slide";
|
|
String masterFooterText = "Footer from the master slide";
|
|
PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
|
|
ppe.setMasterByDefault(true);
|
|
|
|
String text = ppe.getText();
|
|
assertContains(text, masterRandomText);
|
|
assertContains(text, masterFooterText);
|
|
ppe.close();
|
|
}
|
|
|
|
@Test
|
|
public void testMasterText() throws IOException {
|
|
PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
|
|
|
|
// Initially not there
|
|
String text = ppe1.getText();
|
|
assertFalse(text.contains("Text that I added to the master slide"));
|
|
|
|
// Enable, shows up
|
|
ppe1.setMasterByDefault(true);
|
|
text = ppe1.getText();
|
|
assertContains(text, "Text that I added to the master slide");
|
|
|
|
// Make sure placeholder text does not come out
|
|
assertNotContained(text, "Click to edit Master");
|
|
ppe1.close();
|
|
|
|
// Now with another file only containing master text
|
|
// Will always show up
|
|
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
|
|
String masterText = "Footer from the master slide";
|
|
|
|
text = ppe2.getText();
|
|
assertContainsIgnoreCase(text, "master");
|
|
assertContains(text, masterText);
|
|
ppe2.close();
|
|
}
|
|
|
|
/**
|
|
* Bug #54880 Chinese text not extracted properly
|
|
*/
|
|
@Test
|
|
public void testChineseText() throws IOException {
|
|
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
|
|
|
|
String text = ppe.getText();
|
|
|
|
// Check for the english text line
|
|
assertContains(text, "Single byte");
|
|
|
|
// Check for the english text in the mixed line
|
|
assertContains(text, "Mix");
|
|
|
|
// Check for the chinese text in the mixed line
|
|
assertContains(text, "\u8868");
|
|
|
|
// Check for the chinese only text line
|
|
assertContains(text, "\uff8a\uff9d\uff76\uff78");
|
|
ppe.close();
|
|
}
|
|
|
|
/**
|
|
* Tests that we can work with both {@link POIFSFileSystem}
|
|
* and {@link NPOIFSFileSystem}
|
|
*/
|
|
@SuppressWarnings("resource")
|
|
@Test
|
|
public void testDifferentPOIFS() throws IOException {
|
|
// Open the two filesystems
|
|
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
|
|
InputStream is1 = new FileInputStream(pptFile);
|
|
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
|
|
is1.close();
|
|
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
|
|
|
|
DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
|
|
|
|
// Open directly
|
|
for (DirectoryNode dir : files) {
|
|
PowerPointExtractor extractor = new PowerPointExtractor(dir);
|
|
assertEquals(expectText, extractor.getText());
|
|
}
|
|
|
|
// Open via a HSLFSlideShow
|
|
for (DirectoryNode dir : files) {
|
|
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
|
|
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
|
|
assertEquals(expectText, extractor.getText());
|
|
extractor.close();
|
|
slideshow.close();
|
|
}
|
|
|
|
npoifs.close();
|
|
}
|
|
|
|
@Test
|
|
public void testTable() throws Exception {
|
|
PowerPointExtractor ppe1 = openExtractor("54111.ppt");
|
|
String text1 = ppe1.getText();
|
|
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
|
|
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
|
|
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
|
|
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
|
|
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
|
|
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
|
|
assertTrue(text1.contains(target1));
|
|
ppe1.close();
|
|
|
|
PowerPointExtractor ppe2 = openExtractor("54722.ppt");
|
|
String text2 = ppe2.getText();
|
|
|
|
String target2 = "this\tText\tis\twithin\ta\n" +
|
|
"table\t1\t2\t3\t4";
|
|
assertTrue(text2.contains(target2));
|
|
ppe2.close();
|
|
}
|
|
|
|
// bug 60003
|
|
@Test
|
|
public void testExtractMasterSlideFooterText() throws Exception {
|
|
PowerPointExtractor ppe = openExtractor("60003.ppt");
|
|
ppe.setMasterByDefault(true);
|
|
|
|
String text = ppe.getText();
|
|
assertContains(text, "Prague");
|
|
ppe.close();
|
|
}
|
|
}
|