356 lines
13 KiB
Java
356 lines
13 KiB
Java
/* ====================================================================
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
this work for additional information regarding copyright ownership.
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
(the "License"); you may not use this file except in compliance with
|
|
the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==================================================================== */
|
|
|
|
package org.apache.poi.xwpf.extractor;
|
|
|
|
import java.io.IOException;
|
|
|
|
import junit.framework.TestCase;
|
|
|
|
import org.apache.poi.xwpf.XWPFTestDataSamples;
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
|
/**
|
|
* Tests for XWPFWordExtractor
|
|
*/
|
|
public class TestXWPFWordExtractor extends TestCase {
|
|
|
|
/**
|
|
* Get text out of the simple file
|
|
* @throws IOException
|
|
*/
|
|
public void testGetSimpleText() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("sample.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
String text = extractor.getText();
|
|
assertTrue(text.length() > 0);
|
|
|
|
// Check contents
|
|
assertTrue(text.startsWith(
|
|
"Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
|
|
));
|
|
assertTrue(text.endsWith(
|
|
"Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
|
|
));
|
|
|
|
// Check number of paragraphs
|
|
int ps = 0;
|
|
char[] t = text.toCharArray();
|
|
for (int i = 0; i < t.length; i++) {
|
|
if (t[i] == '\n') {
|
|
ps++;
|
|
}
|
|
}
|
|
assertEquals(3, ps);
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Tests getting the text out of a complex file
|
|
* @throws IOException
|
|
*/
|
|
public void testGetComplexText() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("IllustrativeCases.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
String text = extractor.getText();
|
|
assertTrue(text.length() > 0);
|
|
|
|
char euro = '\u20ac';
|
|
// System.err.println("'"+text.substring(text.length() - 40) + "'");
|
|
|
|
// Check contents
|
|
assertTrue(text.startsWith(
|
|
" \n(V) ILLUSTRATIVE CASES\n\n"
|
|
));
|
|
assertTrue(text.contains(
|
|
"As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"// \n\n\n"
|
|
));
|
|
assertTrue(text.endsWith(
|
|
"11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n"
|
|
));
|
|
|
|
// Check number of paragraphs
|
|
int ps = 0;
|
|
char[] t = text.toCharArray();
|
|
for (int i = 0; i < t.length; i++) {
|
|
if (t[i] == '\n') {
|
|
ps++;
|
|
}
|
|
}
|
|
assertEquals(134, ps);
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testGetWithHyperlinks() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("TestDocument.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
// Now check contents
|
|
extractor.setFetchHyperlinks(false);
|
|
assertEquals(
|
|
"This is a test document.\nThis bit is in bold and italic\n" +
|
|
"Back to normal\n" +
|
|
"This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n" +
|
|
"We have a hyperlink here, and another.\n",
|
|
extractor.getText()
|
|
);
|
|
|
|
// One hyperlink is a real one, one is just to the top of page
|
|
extractor.setFetchHyperlinks(true);
|
|
assertEquals(
|
|
"This is a test document.\nThis bit is in bold and italic\n" +
|
|
"Back to normal\n" +
|
|
"This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n" +
|
|
"We have a hyperlink <http://poi.apache.org/> here, and another.\n",
|
|
extractor.getText()
|
|
);
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testHeadersFooters() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("ThreeColHeadFoot.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
assertEquals(
|
|
"First header column!\tMid header\tRight header!\n" +
|
|
"This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
|
|
"\n" +
|
|
"HEADING TEXT\n" +
|
|
"\n" +
|
|
"More on page one\n" +
|
|
"\n\n" +
|
|
"End of page 1\n\n\n" +
|
|
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
|
"Footer Left\tFooter Middle\tFooter Right\n",
|
|
extractor.getText()
|
|
);
|
|
|
|
// Now another file, expect multiple headers
|
|
// and multiple footers
|
|
doc = XWPFTestDataSamples.openSampleDocument("DiffFirstPageHeadFoot.docx");
|
|
extractor.close();
|
|
|
|
extractor = new XWPFWordExtractor(doc);
|
|
extractor.close();
|
|
|
|
extractor =
|
|
new XWPFWordExtractor(doc);
|
|
extractor.getText();
|
|
|
|
assertEquals(
|
|
"I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
|
|
"First header column!\tMid header\tRight header!\n" +
|
|
"This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
|
|
"\n" +
|
|
"HEADING TEXT\n" +
|
|
"\n" +
|
|
"More on page one\n" +
|
|
"\n\n" +
|
|
"End of page 1\n\n\n" +
|
|
"This is page two. It also has a three column heading, and a three column footer.\n" +
|
|
"The footer of the first page\n" +
|
|
"Footer Left\tFooter Middle\tFooter Right\n",
|
|
extractor.getText()
|
|
);
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testFootnotes() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("footnotes.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
String text = extractor.getText();
|
|
assertTrue(text.contains("snoska"));
|
|
assertTrue(text.contains("Eto ochen prostoy[footnoteRef:1] text so snoskoy"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
|
|
public void testTableFootnotes() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("table_footnotes.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
assertTrue(extractor.getText().contains("snoska"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testFormFootnotes() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("form_footnotes.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
String text = extractor.getText();
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc"));
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("test phrase"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testEndnotes() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("endnotes.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
String text = extractor.getText();
|
|
assertTrue(text.contains("XXX"));
|
|
assertTrue(text.contains("tilaka [endnoteRef:2]or 'tika'"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testInsertedDeletedText() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("delins.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
assertTrue(extractor.getText().contains("pendant worn"));
|
|
assertTrue(extractor.getText().contains("extremely well"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
public void testParagraphHeader() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Headers.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
assertTrue(extractor.getText().contains("Section 1"));
|
|
assertTrue(extractor.getText().contains("Section 2"));
|
|
assertTrue(extractor.getText().contains("Section 3"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Test that we can open and process .docm
|
|
* (macro enabled) docx files (bug #45690)
|
|
* @throws IOException
|
|
*/
|
|
public void testDOCMFiles() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("45690.docm");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
assertTrue(extractor.getText().contains("2004"));
|
|
assertTrue(extractor.getText().contains("2008"));
|
|
assertTrue(extractor.getText().contains("(120 "));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Test that we handle things like tabs and
|
|
* carriage returns properly in the text that
|
|
* we're extracting (bug #49189)
|
|
* @throws IOException
|
|
*/
|
|
public void testDocTabs() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("WithTabs.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
|
|
// Check bits
|
|
assertTrue(extractor.getText().contains("a"));
|
|
assertTrue(extractor.getText().contains("\t"));
|
|
assertTrue(extractor.getText().contains("b"));
|
|
|
|
// Now check the first paragraph in total
|
|
assertTrue(extractor.getText().contains("a\tb\n"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* The output should not contain field codes, e.g. those specified in the
|
|
* w:instrText tag (spec sec. 17.16.23)
|
|
* @throws IOException
|
|
*/
|
|
public void testNoFieldCodes() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FieldCodes.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
String text = extractor.getText();
|
|
assertTrue(text.length() > 0);
|
|
assertFalse(text.contains("AUTHOR"));
|
|
assertFalse(text.contains("CREATEDATE"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* The output should contain the values of simple fields, those specified
|
|
* with the fldSimple element (spec sec. 17.16.19)
|
|
* @throws IOException
|
|
*/
|
|
public void testFldSimpleContent() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("FldSimple.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
String text = extractor.getText();
|
|
assertTrue(text.length() > 0);
|
|
assertTrue(text.contains("FldSimple.docx"));
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Test for parsing document with drawings to prevent
|
|
* NoClassDefFoundError for CTAnchor in XWPFRun
|
|
*/
|
|
public void testDrawings() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("drawing.docx");
|
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
|
String text = extractor.getText();
|
|
assertTrue(text.length() > 0);
|
|
|
|
extractor.close();
|
|
}
|
|
|
|
/**
|
|
* Test for basic extraction of SDT content
|
|
* @throws IOException
|
|
*/
|
|
public void testSimpleControlContent() throws IOException {
|
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx");
|
|
String[] targs = new String[]{
|
|
"header_rich_text",
|
|
"rich_text",
|
|
"rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table",
|
|
"plain_text_no_newlines",
|
|
"plain_text_with_newlines1\nplain_text_with_newlines2\n",
|
|
"watermelon\n",
|
|
"dirt\n",
|
|
"4/16/2013\n",
|
|
"rich_text_in_paragraph_in_cell",
|
|
"footer_rich_text",
|
|
"footnote_sdt",
|
|
"endnote_sdt"
|
|
};
|
|
XWPFWordExtractor ex = new XWPFWordExtractor(doc);
|
|
String s = ex.getText().toLowerCase();
|
|
int hits = 0;
|
|
|
|
for (String targ : targs){
|
|
boolean hit = false;
|
|
if (s.indexOf(targ) > -1){
|
|
hit = true;
|
|
hits++;
|
|
}
|
|
assertEquals("controlled content loading-"+targ, true, hit);
|
|
}
|
|
assertEquals("controlled content loading hit count", targs.length, hits);
|
|
}
|
|
}
|