/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements. See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License. You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
|
|
|
|
package org.apache.poi.hpbf.extractor;
|
|
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
|
|
import junit.framework.TestCase;
|
|
|
|
import org.apache.poi.POIDataSamples;
|
|
import org.apache.poi.hpbf.HPBFDocument;
|
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
|
|
|
public final class TestPublisherTextExtractor extends TestCase {
|
|
private static final POIDataSamples _samples = POIDataSamples.getPublisherInstance();
|
|
|
|
private static final String SAMPLE_TEXT =
|
|
"This is some text on the first page\n" +
|
|
"It\u2019s in times new roman, font size 10, all normal\n" +
|
|
"" +
|
|
"This is in bold and italic\n" +
|
|
"It\u2019s Arial, 20 point font\n" +
|
|
"It\u2019s in the second textbox on the first page\n" +
|
|
"" +
|
|
"This is the second page\n\n" +
|
|
"" +
|
|
"It is also times new roman, 10 point\n" +
|
|
"" +
|
|
"Table on page 2\nTop right\n" +
|
|
"P2 table left\nP2 table right\n" +
|
|
"Bottom Left\nBottom Right\n" +
|
|
"" +
|
|
"This text is on page two\n" +
|
|
"#This is a link to Apache POI\n" +
|
|
"More normal text\n" +
|
|
"Link to a file\n" +
|
|
"" +
|
|
"More text, more hyperlinks\n" +
|
|
"email link\n" +
|
|
"Final hyperlink\n" +
|
|
"Within doc to page 1\n";
|
|
private static final String SIMPLE_TEXT =
|
|
"0123456789\n" +
|
|
"0123456789abcdef\n" +
|
|
"0123456789abcdef0123456789abcdef\n" +
|
|
"0123456789\n" +
|
|
"0123456789abcdef\n" +
|
|
"0123456789abcdef0123456789abcdef\n" +
|
|
"0123456789abcdef0123456789abcdef0123456789abcdef\n";
|
|
|
|
public void testBasics() throws Exception {
|
|
HPBFDocument doc = new HPBFDocument(
|
|
_samples.openResourceAsStream("Sample.pub")
|
|
);
|
|
|
|
PublisherTextExtractor ext =
|
|
new PublisherTextExtractor(doc);
|
|
ext.getText();
|
|
|
|
ext = new PublisherTextExtractor(
|
|
_samples.openResourceAsStream("Simple.pub")
|
|
);
|
|
ext.getText();
|
|
}
|
|
|
|
public void testContents() throws Exception {
|
|
PublisherTextExtractor ext;
|
|
File sample = _samples.getFile("Sample.pub");
|
|
File simple = _samples.getFile("Simple.pub");
|
|
|
|
// Check this complicated file using POIFS
|
|
HPBFDocument docOPOIFS = new HPBFDocument(
|
|
new FileInputStream(sample)
|
|
);
|
|
ext = new PublisherTextExtractor(docOPOIFS);
|
|
assertEquals( SAMPLE_TEXT, ext.getText() );
|
|
|
|
// And with NPOIFS
|
|
NPOIFSFileSystem fs = new NPOIFSFileSystem(sample);
|
|
HPBFDocument docNPOIFS = new HPBFDocument(
|
|
fs
|
|
);
|
|
ext = new PublisherTextExtractor(docNPOIFS);
|
|
assertEquals( SAMPLE_TEXT, ext.getText() );
|
|
|
|
|
|
// Now a simpler file
|
|
ext = new PublisherTextExtractor(
|
|
new FileInputStream(simple)
|
|
);
|
|
assertEquals( SIMPLE_TEXT, ext.getText() );
|
|
fs.close();
|
|
}
|
|
|
|
/**
|
|
* We have the same file saved for Publisher 98, Publisher
|
|
* 2000 and Publisher 2007. Check they all agree.
|
|
* @throws Exception
|
|
*/
|
|
public void testMultipleVersions() throws Exception {
|
|
File f;
|
|
HPBFDocument doc;
|
|
|
|
doc = new HPBFDocument(
|
|
_samples.openResourceAsStream("Sample.pub")
|
|
);
|
|
String s2007 = (new PublisherTextExtractor(doc)).getText();
|
|
|
|
doc = new HPBFDocument(
|
|
_samples.openResourceAsStream("Sample2000.pub")
|
|
);
|
|
String s2000 = (new PublisherTextExtractor(doc)).getText();
|
|
|
|
doc = new HPBFDocument(
|
|
_samples.openResourceAsStream("Sample98.pub")
|
|
);
|
|
String s98 = (new PublisherTextExtractor(doc)).getText();
|
|
|
|
// Check they all agree
|
|
assertEquals(s2007, s2000);
|
|
assertEquals(s2007, s98);
|
|
}
|
|
|
|
/**
|
|
* Test that the hyperlink extraction stuff works as well
|
|
* as we can hope it to.
|
|
*/
|
|
public void testWithHyperlinks() throws Exception {
|
|
HPBFDocument doc = new HPBFDocument(
|
|
_samples.openResourceAsStream("LinkAt10.pub")
|
|
);
|
|
|
|
PublisherTextExtractor ext =
|
|
new PublisherTextExtractor(doc);
|
|
ext.getText();
|
|
|
|
// Default is no hyperlinks
|
|
assertEquals("1234567890LINK\n", ext.getText());
|
|
|
|
// Turn on
|
|
ext.setHyperlinksByDefault(true);
|
|
assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
|
|
|
|
|
|
// Now a much more complex document
|
|
ext = new PublisherTextExtractor(
|
|
_samples.openResourceAsStream("Sample.pub")
|
|
);
|
|
ext.setHyperlinksByDefault(true);
|
|
String text = ext.getText();
|
|
|
|
assertTrue(text.endsWith(
|
|
"<http://poi.apache.org/>\n" +
|
|
"<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
|
|
"<>\n" +
|
|
"<mailto:dev@poi.apache.org?subject=HPBF>\n" +
|
|
"<mailto:dev@poi.apache.org?subject=HPBF>\n"
|
|
));
|
|
}
|
|
}
|