HPBF text extractor and unit tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@687443 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
22fae5f0a9
commit
c771ce5a33
@ -0,0 +1,78 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hpbf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hpbf.HPBFDocument;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Extract text from HPBF Publisher files
|
||||
*/
|
||||
public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||
private HPBFDocument doc;
|
||||
|
||||
public PublisherTextExtractor(HPBFDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HPBFDocument(fs));
|
||||
}
|
||||
public PublisherTextExtractor(InputStream is) throws IOException {
|
||||
this(new POIFSFileSystem(is));
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
// Get the text from the Quill Contents
|
||||
QCBit[] bits = doc.getQuillContents().getBits();
|
||||
for(int i=0; i<bits.length; i++) {
|
||||
if(bits[i] != null && bits[i] instanceof QCTextBit) {
|
||||
QCTextBit t = (QCTextBit)bits[i];
|
||||
text.append( t.getText().replace('\r', '\n') );
|
||||
}
|
||||
}
|
||||
|
||||
// Get more text
|
||||
// TODO
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" PublisherTextExtractor <file.pub>");
|
||||
}
|
||||
|
||||
for(int i=0; i<args.length; i++) {
|
||||
PublisherTextExtractor te = new PublisherTextExtractor(
|
||||
new FileInputStream(args[i])
|
||||
);
|
||||
System.out.println(te.getText());
|
||||
}
|
||||
}
|
||||
}
|
@ -26,6 +26,10 @@ public class QCTextBit extends QCBit {
|
||||
super(thingType, bitType, data);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the text. Note that line endings
|
||||
* are \r and not \n
|
||||
*/
|
||||
public String getText() {
|
||||
return StringUtil.getFromUnicodeLE(
|
||||
data, 0, data.length/2
|
||||
|
BIN
src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
Executable file
BIN
src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
Executable file
Binary file not shown.
@ -0,0 +1,105 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hpbf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.hpbf.HPBFDocument;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TextPublisherTextExtractor extends TestCase {
|
||||
private String dir;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
dir = System.getProperty("HPBF.testdata.path");
|
||||
}
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
File f = new File(dir, "Sample.pub");
|
||||
HPBFDocument doc = new HPBFDocument(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
|
||||
PublisherTextExtractor ext =
|
||||
new PublisherTextExtractor(doc);
|
||||
ext.getText();
|
||||
|
||||
f = new File(dir, "Simple.pub");
|
||||
ext = new PublisherTextExtractor(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
ext.getText();
|
||||
}
|
||||
|
||||
public void testContents() throws Exception {
|
||||
File f = new File(dir, "Sample.pub");
|
||||
HPBFDocument doc = new HPBFDocument(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
|
||||
PublisherTextExtractor ext =
|
||||
new PublisherTextExtractor(doc);
|
||||
String text = ext.getText();
|
||||
|
||||
assertEquals(
|
||||
"This is some text on the first page\n" +
|
||||
"It’s in times new roman, font size 10, all normal\n" +
|
||||
"" +
|
||||
"This is in bold and italic\n" +
|
||||
"It’s Arial, 20 point font\n" +
|
||||
"It’s in the second textbox on the first page\n" +
|
||||
"" +
|
||||
"This is the second page\n\n" +
|
||||
"" +
|
||||
"It is also times new roman, 10 point\n" +
|
||||
"" +
|
||||
"Table on page 2\nTop right\n" +
|
||||
"P2 table left\nP2 table right\n" +
|
||||
"Bottom Left\nBottom Right\n" +
|
||||
"" +
|
||||
"This text is on page two\n" +
|
||||
"#This is a link to Apache POI\n" +
|
||||
"More normal text\n" +
|
||||
"Link to a file\n" +
|
||||
"" +
|
||||
"More text, more hyperlinks\n" +
|
||||
"email link\n" +
|
||||
"Final hyperlink\n" +
|
||||
"Within doc to page 1\n"
|
||||
, text
|
||||
);
|
||||
|
||||
// Now a simpler one
|
||||
f = new File(dir, "Simple.pub");
|
||||
ext = new PublisherTextExtractor(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
text = ext.getText();
|
||||
assertEquals(
|
||||
"0123456789\n" +
|
||||
"0123456789abcdef\n" +
|
||||
"0123456789abcdef0123456789abcdef\n" +
|
||||
"0123456789\n" +
|
||||
"0123456789abcdef\n" +
|
||||
"0123456789abcdef0123456789abcdef\n" +
|
||||
"0123456789abcdef0123456789abcdef0123456789abcdef\n"
|
||||
, text
|
||||
);
|
||||
}
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hpbf.model;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.hpbf.HPBFDocument;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestEscherParts extends TestCase {
|
||||
private String dir;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
dir = System.getProperty("HPBF.testdata.path");
|
||||
}
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
File f = new File(dir, "Sample.pub");
|
||||
HPBFDocument doc = new HPBFDocument(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
|
||||
EscherStm es = doc.getEscherStm();
|
||||
EscherDelayStm eds = doc.getEscherDelayStm();
|
||||
|
||||
assertNotNull(es);
|
||||
assertNotNull(eds);
|
||||
|
||||
assertEquals(13, es.getEscherRecords().length);
|
||||
assertEquals(0, eds.getEscherRecords().length);
|
||||
|
||||
// TODO - check the contents
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user