From 74e0bde7733308f7f64b67adfdf081dda7ece2f1 Mon Sep 17 00:00:00 2001 From: Jeroen Ooms Date: Tue, 3 Jul 2018 14:32:17 +0200 Subject: [PATCH] example/crawler.c: simple crawler based on libxml2 Closes #2706 --- docs/examples/Makefile.inc | 2 +- docs/examples/crawler.c | 210 +++++++++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 docs/examples/crawler.c diff --git a/docs/examples/Makefile.inc b/docs/examples/Makefile.inc index e8aa9fa68..9215b82fb 100644 --- a/docs/examples/Makefile.inc +++ b/docs/examples/Makefile.inc @@ -43,4 +43,4 @@ COMPLICATED_EXAMPLES = curlgtk.c curlx.c htmltitle.cpp cacertinmem.c \ sampleconv.c synctime.c threaded-ssl.c evhiperfifo.c \ smooth-gtk-thread.c version-check.pl href_extractor.c asiohiper.cpp \ multi-uv.c xmlstream.c usercertinmem.c sessioninfo.c \ - threaded-shared-conn.c + threaded-shared-conn.c crawler.c diff --git a/docs/examples/crawler.c b/docs/examples/crawler.c new file mode 100644 index 000000000..47c427473 --- /dev/null +++ b/docs/examples/crawler.c @@ -0,0 +1,210 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Web crawler based on curl and libxml2. + * Copyright (C) 2018 Jeroen Ooms + * License: MIT + * + * To compile: + * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) + * + */ +/* + * Web crawler based on curl and libxml2 to stress-test curl with + * hundreds of concurrent connections to various servers. + * + */ + +/* Parameters */ +int max_con = 200; +int max_total = 20000; +int max_requests = 500; +int max_link_per_page = 5; +int follow_relative_links = 0; +char *start_page = "https://www.reuters.com"; + +#include +#include +#include +#include +#include +#include +#include +#include + +int pending_interrupt = 0; +void sighandler(int dummy) +{ + pending_interrupt = 1; +} + +/* resizable buffer */ +typedef struct { + char *buf; + size_t size; +} memory; + +size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx) +{ + size_t realsize = sz * nmemb; + memory *mem = (memory*) ctx; + mem->buf = realloc(mem->buf, mem->size + realsize); + memcpy(&(mem->buf[mem->size]), contents, realsize); + mem->size += realsize; + return realsize; +} + +CURL *make_handle(char *url) +{ + CURL *handle = curl_easy_init(); + + /* Important: use HTTP2 over HTTPS */ + curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); + curl_easy_setopt(handle, CURLOPT_URL, url); + + /* buffer body */ + memory *mem = malloc(sizeof(memory)); + mem->size = 0; + mem->buf = malloc(1); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); + curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); + + /* For completeness */ + curl_easy_setopt(handle, CURLOPT_ENCODING, "gzip, deflate"); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); + curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); + curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L); + curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); + curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); + curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); + curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); + curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); + curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); + curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); + return handle; +} + +/* HREF finder implemented in libxml2 but could be any HTML parser */ +size_t follow_links(CURLM *multi_handle, memory *mem, char *url) +{ + int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ + HTML_PARSE_NOWARNING | HTML_PARSE_NONET; + htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); + if(!doc) + return 0; + xmlChar *xpath = (xmlChar*) "//a/@href"; + xmlXPathContextPtr context = xmlXPathNewContext(doc); + xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); + xmlXPathFreeContext(context); + if(!result) + return 0; + xmlNodeSetPtr nodeset = result->nodesetval; + if(xmlXPathNodeSetIsEmpty(nodeset)) { + xmlXPathFreeObject(result); + return 0; + } + size_t count = 0; + for(int i = 0; i < nodeset->nodeNr; i++) { + double r = rand(); + int x = r * nodeset->nodeNr / RAND_MAX; + const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; + xmlChar *href = xmlNodeListGetString(doc, node, 1); + if(follow_relative_links) { + xmlChar *orig = href; + href = xmlBuildURI(href, (xmlChar *) url); + xmlFree(orig); + } + char *link = (char *) href; + if(!link || strlen(link) < 20) + continue; + if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { + curl_multi_add_handle(multi_handle, make_handle(link)); + if(count++ == max_link_per_page) + break; + } + xmlFree(link); + } + xmlXPathFreeObject(result); + return count; +} + +int is_html(char *ctype) +{ + return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html"); +} + +int main(void) +{ + signal(SIGINT, sighandler); + LIBXML_TEST_VERSION; + curl_global_init(CURL_GLOBAL_DEFAULT); + CURLM *multi_handle = curl_multi_init(); + curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); + curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); + + /* enables http/2 if available */ + #ifdef CURLPIPE_MULTIPLEX + curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); + #endif + + /* sets html start page */ + curl_multi_add_handle(multi_handle, make_handle(start_page)); + + int msgs_left; + int pending = 0; + int complete = 0; + int still_running = 1; + while(still_running && !pending_interrupt) { + int numfds; + curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); + curl_multi_perform(multi_handle, &still_running); + + /* See how the transfers went */ + CURLMsg *m = NULL; + while((m = curl_multi_info_read(multi_handle, &msgs_left))) { + if(m->msg == CURLMSG_DONE) { + CURL *handle = m->easy_handle; + char *url; + memory *mem; + curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); + curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); + if(m->data.result == CURLE_OK) { + long res_status; + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); + if(res_status == 200) { + char *ctype; + curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); + printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); + if(is_html(ctype) && mem->size > 100) { + if(pending < max_requests && (complete + pending) < max_total) { + pending += follow_links(multi_handle, mem, url); + still_running = 1; + } + } + } + else { + printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); + } + } + else { + printf("[%d] Connection failure: %s\n", complete, url); + } + curl_multi_remove_handle(multi_handle, handle); + curl_easy_cleanup(handle); + free(mem->buf); + free(mem); + complete++; + pending--; + } + } + } + curl_multi_cleanup(multi_handle); + curl_global_cleanup(); + return 0; +}