/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
package org.apache.poi.hslf.extractor ;
import junit.framework.TestCase ;
/ * *
* Tests that the extractor correctly gets the text out of our sample file
*
* @author Nick Burch ( nick at torchbox dot com )
* /
public class TextExtractor extends TestCase {
2007-03-29 11:34:25 -04:00
/** Extractor primed on the 2 page basic test data */
2005-05-28 01:36:00 -04:00
private PowerPointExtractor ppe ;
2007-03-29 11:34:25 -04:00
/** Extractor primed on the 1 page but text-box'd test data */
2005-11-13 12:01:54 -05:00
private PowerPointExtractor ppe2 ;
2007-03-29 11:34:25 -04:00
/** Where to go looking for our test files */
private String dirname ;
2005-05-28 01:36:00 -04:00
public TextExtractor ( ) throws Exception {
2007-03-29 11:34:25 -04:00
dirname = System . getProperty ( " HSLF.testdata.path " ) ;
2005-05-28 01:36:00 -04:00
String filename = dirname + " /basic_test_ppt_file.ppt " ;
ppe = new PowerPointExtractor ( filename ) ;
2005-11-13 12:01:54 -05:00
String filename2 = dirname + " /with_textbox.ppt " ;
ppe2 = new PowerPointExtractor ( filename2 ) ;
2005-05-28 01:36:00 -04:00
}
public void testReadSheetText ( ) throws Exception {
2005-11-13 12:01:54 -05:00
// Basic 2 page example
2005-05-28 01:36:00 -04:00
String sheetText = ppe . getText ( ) ;
String expectText = " This is a test title \ nThis is a test subtitle \ nThis is on page 1 \ nThis is the title on page 2 \ nThis is page two \ nIt has several blocks of text \ nNone of them have formatting \ n " ;
2005-11-13 12:01:54 -05:00
ensureTwoStringsTheSame ( expectText , sheetText ) ;
// 1 page example with text boxes
sheetText = ppe2 . getText ( ) ;
expectText = " Hello, World!!! \ nI am just a poor boy \ nThis is Times New Roman \ nPlain Text \ n " ;
2005-05-28 01:36:00 -04:00
2005-11-13 12:01:54 -05:00
ensureTwoStringsTheSame ( expectText , sheetText ) ;
}
2005-05-28 01:36:00 -04:00
public void testReadNoteText ( ) throws Exception {
2005-11-13 12:01:54 -05:00
// Basic 2 page example
2005-05-28 01:36:00 -04:00
String notesText = ppe . getNotes ( ) ;
String expectText = " These are the notes for page 1 \ nThese are the notes on page two, again lacking formatting \ n " ;
2005-11-13 12:01:54 -05:00
ensureTwoStringsTheSame ( expectText , notesText ) ;
// Other one doesn't have notes
notesText = ppe2 . getNotes ( ) ;
expectText = " " ;
ensureTwoStringsTheSame ( expectText , notesText ) ;
2005-05-28 01:36:00 -04:00
}
2007-03-29 11:34:25 -04:00
/ * *
* Test that when presented with a PPT file missing the odd
* core record , we can still get the rest of the text out
* @throws Exception
* /
public void testMissingCoreRecords ( ) throws Exception {
String filename = dirname + " /missing_core_records.ppt " ;
ppe = new PowerPointExtractor ( filename ) ;
String text = ppe . getText ( true , false ) ;
String nText = ppe . getNotes ( ) ;
assertNotNull ( text ) ;
assertNotNull ( nText ) ;
// Notes record were corrupt, so don't expect any
assertEquals ( nText . length ( ) , 0 ) ;
// Slide records were fine
assertTrue ( text . startsWith ( " Using Disease Surveillance and Response " ) ) ;
}
2005-11-13 12:01:54 -05:00
private void ensureTwoStringsTheSame ( String exp , String act ) throws Exception {
assertEquals ( exp . length ( ) , act . length ( ) ) ;
char [ ] expC = exp . toCharArray ( ) ;
char [ ] actC = act . toCharArray ( ) ;
for ( int i = 0 ; i < expC . length ; i + + ) {
System . out . println ( i + " \ t " + expC [ i ] + " " + actC [ i ] ) ;
assertEquals ( expC [ i ] , actC [ i ] ) ;
}
assertEquals ( exp , act ) ;
}
2005-05-28 01:36:00 -04:00
}