diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
index 070f0c18a..f02982949 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
 import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
@@ -53,6 +54,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
 
     private XWPFDocument document;
     private boolean fetchHyperlinks = false;
+    private boolean concatenatePhoneticRuns = true;
 
     public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
         this(new XWPFDocument(container));
@@ -86,6 +88,14 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
         fetchHyperlinks = fetch;
     }
 
+    /**
+     * Should we concatenate phonetic runs in extraction. Default is true
+     * @param concatenatePhoneticRuns true to append the phonetic (ruby) text after the base text of a run
+     */
+    public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
+        this.concatenatePhoneticRuns = concatenatePhoneticRuns;
+    }
+
     public String getText() {
         StringBuffer text = new StringBuffer();
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
@@ -130,7 +140,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
 
 
         for (IRunElement run : paragraph.getRuns()) {
-            text.append(run);
+            if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
+                text.append(((XWPFRun)run).text());
+            } else {
+                text.append(run);
+            }
             if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
                 XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
                 if (link != null)
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
index 85289c502..6c16b1618 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
@@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@@ -1042,10 +1044,15 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
     }
 
     /**
-     * Returns the string version of the text
+     * Returns the string version of the text and the phonetic string
      */
     public String toString() {
-        return text();
+        String phonetic = getPhonetic();
+        if (phonetic.length() > 0) {
+            return text() + " (" + phonetic + ")";
+        } else {
+            return text();
+        }
     }
 
     /**
@@ -1061,71 +1068,148 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
         c.selectPath("./*");
         while (c.toNextSelection()) {
             XmlObject o = c.getObject();
-            if (o instanceof CTText) {
-                String tagName = o.getDomNode().getNodeName();
-                // Field Codes (w:instrText, defined in spec sec. 17.16.23)
-                // come up as instances of CTText, but we don't want them
-                // in the normal text output
-                if (!"w:instrText".equals(tagName)) {
-                    text.append(((CTText) o).getStringValue());
-                }
-            }
-
-            // Complex type evaluation (currently only for extraction of check boxes)
-            if (o instanceof CTFldChar) {
-                CTFldChar ctfldChar = ((CTFldChar) o);
-                if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
-                    if (ctfldChar.getFfData() != null) {
-                        for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
-                            if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
-                                text.append("|X|");
-                            } else {
-                                text.append("|_|");
-                            }
-                        }
-                    }
-                }
-            }
-
-            if (o instanceof CTPTab) {
-                text.append("\t");
-            }
-            if (o instanceof CTBr) {
-                text.append("\n");
-            }
-            if (o instanceof CTEmpty) {
-                // Some inline text elements get returned not as
-                // themselves, but as CTEmpty, owing to some odd
-                // definitions around line 5642 of the XSDs
-                // This bit works around it, and replicates the above
-                // rules for that case
-                String tagName = o.getDomNode().getNodeName();
-                if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
-                    text.append("\t");
-                }
-                if ("w:br".equals(tagName) || "br".equals(tagName)) {
-                    text.append("\n");
-                }
-                if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
-                    text.append("\n");
-                }
-            }
-            if (o instanceof CTFtnEdnRef) {
-                CTFtnEdnRef ftn = (CTFtnEdnRef) o;
-                String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
-                        "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
-                text.append(footnoteRef);
-            }
+            if (o instanceof CTRuby) {
+                handleRuby(o, text, false);
+                continue;
+            }
+            _getText(o, text);
         }
 
         c.dispose();
 
         // Any picture text?
         if (pictureText != null && pictureText.length() > 0) {
             text.append("\n").append(pictureText);
         }
 
         return text.toString();
     }
 
+    /**
+     * Returns the phonetic (ruby) text of this run.
+     *
+     * @return the phonetic (ruby) string associated with this run or an empty String if none exists
+     */
+    public String getPhonetic() {
+        StringBuffer text = new StringBuffer();
+
+        // Walk the direct children of the run in document order and
+        // collect the phonetic (rt) part of every ruby element
+        XmlCursor c = run.newCursor();
+        c.selectPath("./*");
+        while (c.toNextSelection()) {
+            XmlObject o = c.getObject();
+            if (o instanceof CTRuby) {
+                handleRuby(o, text, true);
+            }
+        }
+        c.dispose();
+        return text.toString();
+    }
+
+    /**
+     * Appends either the phonetic (rt) or the base (rubyBase) text of a ruby element.
+     *
+     * @param rubyObj         rubyobject
+     * @param text            buffer to which to append the content
+     * @param extractPhonetic extract the phonetic (rt) component or the base component
+     */
+    private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
+        XmlCursor c = rubyObj.newCursor();
+
+        //according to the spec, a ruby object
+        //has the phonetic (rt) first, then the actual text (base)
+        //second.
+
+        c.selectPath(".//*");
+        boolean inRT = false;
+        boolean inBase = false;
+        while (c.toNextSelection()) {
+            XmlObject o = c.getObject();
+            if (o instanceof CTRubyContent) {
+                String tagName = o.getDomNode().getNodeName();
+                if ("w:rt".equals(tagName)) {
+                    inRT = true;
+                } else if ("w:rubyBase".equals(tagName)) {
+                    inRT = false;
+                    inBase = true;
+                }
+            } else {
+                if (extractPhonetic && inRT) {
+                    _getText(o, text);
+                } else if (! extractPhonetic && inBase) {
+                    _getText(o, text);
+                }
+            }
+        }
+        c.dispose();
+    }
+
+    /**
+     * Appends the text contribution of a single child element of a run
+     * (text, check box markers, tabs, breaks, footnote/endnote references).
+     * Picture text is not handled here: it belongs to the run as a whole
+     * and is appended exactly once by {@link #text()}.
+     *
+     * @param o    the element to convert
+     * @param text buffer to which to append the content
+     */
+    private void _getText(XmlObject o, StringBuffer text) {
+        if (o instanceof CTText) {
+            String tagName = o.getDomNode().getNodeName();
+            // Field Codes (w:instrText, defined in spec sec. 17.16.23)
+            // come up as instances of CTText, but we don't want them
+            // in the normal text output
+            if (!"w:instrText".equals(tagName)) {
+                text.append(((CTText) o).getStringValue());
+            }
+        }
+
+        // Complex type evaluation (currently only for extraction of check boxes)
+        if (o instanceof CTFldChar) {
+            CTFldChar ctfldChar = ((CTFldChar) o);
+            if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) {
+                if (ctfldChar.getFfData() != null) {
+                    for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) {
+                        if (checkBox.getDefault() != null && checkBox.getDefault().getVal() == STOnOff.X_1) {
+                            text.append("|X|");
+                        } else {
+                            text.append("|_|");
+                        }
+                    }
+                }
+            }
+        }
+
+        if (o instanceof CTPTab) {
+            text.append("\t");
+        }
+        if (o instanceof CTBr) {
+            text.append("\n");
+        }
+        if (o instanceof CTEmpty) {
+            // Some inline text elements get returned not as
+            // themselves, but as CTEmpty, owing to some odd
+            // definitions around line 5642 of the XSDs
+            // This bit works around it, and replicates the above
+            // rules for that case
+            String tagName = o.getDomNode().getNodeName();
+            if ("w:tab".equals(tagName) || "tab".equals(tagName)) {
+                text.append("\t");
+            }
+            if ("w:br".equals(tagName) || "br".equals(tagName)) {
+                text.append("\n");
+            }
+            if ("w:cr".equals(tagName) || "cr".equals(tagName)) {
+                text.append("\n");
+            }
+        }
+        if (o instanceof CTFtnEdnRef) {
+            CTFtnEdnRef ftn = (CTFtnEdnRef) o;
+            String footnoteRef = ftn.getDomNode().getLocalName().equals("footnoteReference") ?
+                    "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
+            text.append(footnoteRef);
+        }
+    }
+
     /**
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index b83b27d73..b7d0b03f6 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -421,4 +421,16 @@ public class TestXWPFWordExtractor extends TestCase {
                 extractor.getText());
         extractor.close();
     }
+
+    public void testPhonetic() throws IOException {
+        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");
+        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+        //expect: baseText (phoneticText)
+        assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
+        extractor.close();
+        extractor = new XWPFWordExtractor(doc);
+        extractor.setConcatenatePhoneticRuns(false);
+        assertEquals("\u6771\u4EAC", extractor.getText().trim());
+    }
+
 }
diff --git a/test-data/document/61470.docx b/test-data/document/61470.docx
new file mode 100644
index 000000000..6fc1afe45
Binary files /dev/null and b/test-data/document/61470.docx differ