2007-12-30 11:53:42 -05:00
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
2008-03-09 10:39:36 -04:00
package org.apache.poi.xwpf.extractor ;
2007-12-30 11:53:42 -05:00
import java.io.File ;
2008-03-09 10:39:36 -04:00
import org.apache.poi.POIXMLDocument ;
import org.apache.poi.xwpf.XWPFDocument ;
2007-12-30 11:53:42 -05:00
import junit.framework.TestCase ;
/**
 * Tests for XWPFWordExtractor
 */
2008-03-09 10:39:36 -04:00
public class TestXWPFWordExtractor extends TestCase {
2007-12-30 11:53:42 -05:00
/ * *
* A very simple file
* /
2008-03-09 10:39:36 -04:00
private XWPFDocument xmlA ;
private File fileA ;
2007-12-30 11:53:42 -05:00
/ * *
* A fairly complex file
* /
2008-03-09 10:39:36 -04:00
private XWPFDocument xmlB ;
private File fileB ;
2008-04-09 08:22:23 -04:00
/ * *
* File with hyperlinks
* /
private XWPFDocument xmlC ;
private File fileC ;
2007-12-30 11:53:42 -05:00
protected void setUp ( ) throws Exception {
super . setUp ( ) ;
2008-03-09 10:39:36 -04:00
fileA = new File (
2007-12-30 11:53:42 -05:00
System . getProperty ( " HWPF.testdata.path " ) +
File . separator + " sample.docx "
) ;
2008-03-09 10:39:36 -04:00
fileB = new File (
2007-12-30 11:53:42 -05:00
System . getProperty ( " HWPF.testdata.path " ) +
File . separator + " IllustrativeCases.docx "
) ;
2008-04-09 08:22:23 -04:00
fileC = new File (
System . getProperty ( " HWPF.testdata.path " ) +
File . separator + " TestDocument.docx "
) ;
2008-03-09 10:39:36 -04:00
assertTrue ( fileA . exists ( ) ) ;
assertTrue ( fileB . exists ( ) ) ;
2008-04-09 08:22:23 -04:00
assertTrue ( fileC . exists ( ) ) ;
2007-12-30 11:53:42 -05:00
2008-03-09 10:39:36 -04:00
xmlA = new XWPFDocument ( POIXMLDocument . openPackage ( fileA . toString ( ) ) ) ;
xmlB = new XWPFDocument ( POIXMLDocument . openPackage ( fileB . toString ( ) ) ) ;
2008-04-09 08:22:23 -04:00
xmlC = new XWPFDocument ( POIXMLDocument . openPackage ( fileC . toString ( ) ) ) ;
2007-12-30 11:53:42 -05:00
}
/ * *
* Get text out of the simple file
* /
public void testGetSimpleText ( ) throws Exception {
2008-03-09 10:39:36 -04:00
new XWPFWordExtractor ( xmlA ) ;
new XWPFWordExtractor ( POIXMLDocument . openPackage ( fileA . toString ( ) ) ) ;
2007-12-30 11:53:42 -05:00
2008-03-09 10:39:36 -04:00
XWPFWordExtractor extractor =
new XWPFWordExtractor ( xmlA ) ;
2007-12-30 11:53:42 -05:00
extractor . getText ( ) ;
String text = extractor . getText ( ) ;
assertTrue ( text . length ( ) > 0 ) ;
// Check contents
assertTrue ( text . startsWith (
" Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio. "
) ) ;
assertTrue ( text . endsWith (
" Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum. \ n "
) ) ;
// Check number of paragraphs
int ps = 0 ;
char [ ] t = text . toCharArray ( ) ;
for ( int i = 0 ; i < t . length ; i + + ) {
if ( t [ i ] = = '\n' ) { ps + + ; }
}
assertEquals ( 3 , ps ) ;
}
/ * *
* Tests getting the text out of a complex file
* /
public void testGetComplexText ( ) throws Exception {
2008-03-09 10:39:36 -04:00
XWPFWordExtractor extractor =
new XWPFWordExtractor ( xmlB ) ;
2007-12-30 11:53:42 -05:00
extractor . getText ( ) ;
String text = extractor . getText ( ) ;
assertTrue ( text . length ( ) > 0 ) ;
char euro = '\u20ac' ;
System . err . println ( " ' " + text . substring ( text . length ( ) - 20 ) + " ' " ) ;
// Check contents
assertTrue ( text . startsWith (
" \ n(V) ILLUSTRATIVE CASES \ n \ n "
) ) ;
2008-08-09 06:45:19 -04:00
assertTrue ( text . contains (
2007-12-30 11:53:42 -05:00
" As well as gaining " + euro + " 90 from child benefit increases, he will also receive the early childhood supplement of " + euro + " 250 per quarter for Vincent for the full four quarters of the year. \ n \ n \ n \ n \ n \ n \ n "
) ) ;
2008-08-09 06:45:19 -04:00
assertTrue ( text . endsWith (
" 11.4% \ t \ t90 \ t \ t \ t \ t \ t250 \ t \ t1,310 \ t \ t \ n \ n "
) ) ;
2007-12-30 11:53:42 -05:00
// Check number of paragraphs
int ps = 0 ;
char [ ] t = text . toCharArray ( ) ;
for ( int i = 0 ; i < t . length ; i + + ) {
if ( t [ i ] = = '\n' ) { ps + + ; }
}
2008-08-09 06:45:19 -04:00
assertEquals ( 103 , ps ) ;
2007-12-30 11:53:42 -05:00
}
2008-04-09 08:22:23 -04:00
public void testGetWithHyperlinks ( ) throws Exception {
XWPFWordExtractor extractor =
new XWPFWordExtractor ( xmlC ) ;
extractor . getText ( ) ;
extractor . setFetchHyperlinks ( true ) ;
extractor . getText ( ) ;
// Now check contents
// TODO - fix once correctly handling contents
extractor . setFetchHyperlinks ( false ) ;
assertEquals (
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
" This is a test document \ nThis bit is in bold and italic \ n " +
" Back to normal \ nWe have a here, and .hyperlinkanother \ n " ,
extractor . getText ( )
) ;
extractor . setFetchHyperlinks ( true ) ;
assertEquals (
// "This is a test document\nThis bit is in bold and italic\n" +
// "Back to normal\nWe have a hyperlink here, and another.\n",
" This is a test document \ nThis bit is in bold and italic \ n " +
" Back to normal \ nWe have a here, and .hyperlink <http://poi.apache.org/>another \ n " ,
extractor . getText ( )
) ;
}
2007-12-30 11:53:42 -05:00
}