diff --git a/examples/HtmlParser.cc b/examples/HtmlParser.cc
new file mode 100644
index 0000000..f642046
--- /dev/null
+++ b/examples/HtmlParser.cc
@@ -0,0 +1,189 @@
+/*
+ * Spdylay - SPDY Library
+ *
+ * Copyright (c) 2012 Tatsuhiro Tsujikawa
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "HtmlParser.h"
+
+#include "util.h"
+#include "uri.h"
+
+namespace spdylay {
+
+// Stores the base URI that relative links found while parsing are
+// resolved against.
+ParserData::ParserData(const std::string& base_uri)
+  : base_uri(base_uri)
+{}
+
+// The libxml2 parser context is not created here; parse_chunk()
+// creates it when the first chunk arrives.
+HtmlParser::HtmlParser(const std::string& base_uri)
+  : base_uri_(base_uri),
+    parser_ctx_(0),
+    parser_data_(base_uri)
+{}
+
+// Releases the parser context if parse_chunk() created one.
+HtmlParser::~HtmlParser()
+{
+  if(parser_ctx_) {
+    htmlFreeParserCtxt(parser_ctx_);
+  }
+}
+
+namespace {
+// Returns the value of the attribute |name| (compared with
+// util::strieq) from |attrs|, an array of alternating name/value
+// pointers terminated by a null name.  Returns 0 if |attrs| itself is
+// null or no attribute matches.
+const char* get_attr(const xmlChar **attrs, const char *name)
+{
+  // libxml2 may hand the start-element callback a null array when the
+  // element has no attributes, so it must not be dereferenced.
+  if(!attrs) {
+    return 0;
+  }
+  for(; *attrs; attrs += 2) {
+    if(util::strieq(reinterpret_cast<const char*>(attrs[0]), name)) {
+      return reinterpret_cast<const char*>(attrs[1]);
+    }
+  }
+  return 0;
+}
+} // namespace
+
+namespace {
+// SAX start-element callback.  |user_data| is the ParserData that
+// parse_chunk() registered with the parser context.  Appends to its
+// links the joined URI of <link> elements whose rel is "shortcut icon"
+// or "stylesheet" (from href) and of <img> elements (from src).
+void start_element_func
+(void* user_data,
+ const xmlChar *name,
+ const xmlChar **attrs)
+{
+  ParserData *parser_data = static_cast<ParserData*>(user_data);
+  const char *elem_name = reinterpret_cast<const char*>(name);
+  if(util::strieq(elem_name, "link")) {
+    const char *rel_attr = get_attr(attrs, "rel");
+    const char *href_attr = get_attr(attrs, "href");
+    // Both attributes are required; test for null explicitly instead
+    // of passing a missing rel value to the string comparison.
+    if(rel_attr && href_attr &&
+       (util::strieq(rel_attr, "shortcut icon") ||
+        util::strieq(rel_attr, "stylesheet"))) {
+      std::string link_uri = uri::joinUri(parser_data->base_uri, href_attr);
+      parser_data->links.push_back(link_uri);
+    }
+  } else if(util::strieq(elem_name, "img")) {
+    const char *src_attr = get_attr(attrs, "src");
+    if(src_attr) {
+      std::string link_uri = uri::joinUri(parser_data->base_uri, src_attr);
+      parser_data->links.push_back(link_uri);
+    }
+  }
+}
+} // namespace
+
+namespace {
+// SAX handler table; only the start-element callback is installed.
+xmlSAXHandler saxHandler =
+  {
+    0, // internalSubsetSAXFunc
+    0, // isStandaloneSAXFunc
+    0, // hasInternalSubsetSAXFunc
+    0, // hasExternalSubsetSAXFunc
+    0, // resolveEntitySAXFunc
+    0, // getEntitySAXFunc
+    0, // entityDeclSAXFunc
+    0, // notationDeclSAXFunc
+    0, // attributeDeclSAXFunc
+    0, // elementDeclSAXFunc
+    0, // unparsedEntityDeclSAXFunc
+    0, // setDocumentLocatorSAXFunc
+    0, // startDocumentSAXFunc
+    0, // endDocumentSAXFunc
+    &start_element_func, // startElementSAXFunc
+    0, // endElementSAXFunc
+    0, // referenceSAXFunc
+    0, // charactersSAXFunc
+    0, // ignorableWhitespaceSAXFunc
+    0, // processingInstructionSAXFunc
+    0, // commentSAXFunc
+    0, // warningSAXFunc
+    0, // errorSAXFunc
+    0, // fatalErrorSAXFunc
+    0, // getParameterEntitySAXFunc
+    0, // cdataBlockSAXFunc
+    0, // externalSubsetSAXFunc
+    0, // unsigned int initialized
+    0, // void * _private
+    0, // startElementNsSAX2Func
+    0, // endElementNsSAX2Func
+    0, // xmlStructuredErrorFunc
+  };
+} // namespace
+
+// Feeds |size| bytes at |chunk| to the push parser; a nonzero |fin|
+// marks the end of the document.  The first call creates the parser
+// context and hands |chunk| to it.  Returns 0 on success, -1 on
+// failure.
+int HtmlParser::parse_chunk(const char *chunk, size_t size, int fin)
+{
+  if(parser_ctx_) {
+    return parse_chunk_internal(chunk, size, fin);
+  }
+  parser_ctx_ = htmlCreatePushParserCtxt(&saxHandler,
+                                         &parser_data_,
+                                         chunk, size,
+                                         base_uri_.c_str(),
+                                         XML_CHAR_ENCODING_NONE);
+  if(!parser_ctx_) {
+    return -1;
+  }
+  // |chunk| was already given to the context at creation, so only
+  // the terminate flag is left to deliver.
+  return fin ? parse_chunk_internal(0, 0, fin) : 0;
+}
+
+// Passes the arguments to htmlParseChunk() and maps its result to 0
+// (it returned 0) or -1 (anything else).
+int HtmlParser::parse_chunk_internal(const char *chunk, size_t size,
+                                     int fin)
+{
+  return htmlParseChunk(parser_ctx_, chunk, size, fin) == 0 ? 0 : -1;
+}
+
+// Returns the URIs collected so far.
+const std::vector<std::string>& HtmlParser::get_links() const
+{
+  return parser_data_.links;
+}
+
+// Discards the URIs collected so far.
+void HtmlParser::clear_links()
+{
+  parser_data_.links.clear();
+}
+
+} // namespace spdylay
diff --git a/examples/HtmlParser.h b/examples/HtmlParser.h
new file mode 100644
index 0000000..eeb1a07
--- /dev/null
+++ b/examples/HtmlParser.h
@@ -0,0 +1,94 @@
+/*
+ * Spdylay - SPDY Library
+ *
+ * Copyright (c) 2012 Tatsuhiro Tsujikawa
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef HTML_PARSER_H
+#define HTML_PARSER_H
+
+// NOTE(review): the header name on the next line was lost in
+// extraction; presumably the build configuration header that defines
+// HAVE_LIBXML2 -- confirm against the repository.
+#include <config.h>
+
+#include <vector>
+#include <string>
+
+#ifdef HAVE_LIBXML2
+
+#include <libxml/HTMLparser.h>
+
+namespace spdylay {
+
+// State shared with the SAX callback: the base URI used to resolve
+// links and the URIs collected so far.
+struct ParserData {
+  std::string base_uri;
+  std::vector<std::string> links;
+  ParserData(const std::string& base_uri);
+};
+
+// Incremental HTML parser that collects the URIs of stylesheets,
+// shortcut icons and images referenced by the document.
+//
+// NOTE: the destructor frees parser_ctx_ and the context is given a
+// pointer to parser_data_, but no copy operations are declared, so
+// instances must not be copied.
+class HtmlParser {
+public:
+  HtmlParser(const std::string& base_uri);
+  ~HtmlParser();
+  int parse_chunk(const char *chunk, size_t size, int fin);
+  const std::vector<std::string>& get_links() const;
+  void clear_links();
+private:
+  int parse_chunk_internal(const char *chunk, size_t size, int fin);
+
+  std::string base_uri_;
+  htmlParserCtxtPtr parser_ctx_;
+  ParserData parser_data_;
+};
+
+} // namespace spdylay
+
+#else // !HAVE_LIBXML2
+
+namespace spdylay {
+
+// Stand-in used when HAVE_LIBXML2 is not defined: parse_chunk() does
+// nothing and reports success, and get_links() is always empty.
+class HtmlParser {
+public:
+  HtmlParser(const std::string& base_uri) {}
+  ~HtmlParser() {}
+  int parse_chunk(const char *chunk, size_t size, int fin) { return 0; }
+  const std::vector<std::string>& get_links() const { return links_; }
+  void clear_links() {}
+private:
+  std::vector<std::string> links_;
+};
+
+} // namespace spdylay
+
+#endif // !HAVE_LIBXML2
+
+#endif // HTML_PARSER_H