example/crawler.c: simple crawler based on libxml2

Closes #2706
2025-02-28 09:21:50 -05:00 · 2018-07-03 14:32:17 +02:00 · 2018-07-03 14:32:17 +02:00 · 74e0bde773
commit 74e0bde773
parent b0e4598ff7
2 changed files with 211 additions and 1 deletions
--- a/docs/examples/Makefile.inc
+++ b/docs/examples/Makefile.inc
@ -43,4 +43,4 @@ COMPLICATED_EXAMPLES = curlgtk.c curlx.c htmltitle.cpp cacertinmem.c	\
  sampleconv.c synctime.c threaded-ssl.c evhiperfifo.c			\
  smooth-gtk-thread.c version-check.pl href_extractor.c asiohiper.cpp	\
  multi-uv.c xmlstream.c usercertinmem.c sessioninfo.c			\
-  threaded-shared-conn.c
+  threaded-shared-conn.c crawler.c
--- a/docs/examples/crawler.c
+++ b/docs/examples/crawler.c
@ -0,0 +1,210 @@
+/***************************************************************************
+ *                                  _   _ ____  _
+ *  Project                     ___| | | |  _ \| |
+ *                             / __| | | | |_) | |
+ *                            | (__| |_| |  _ <| |___
+ *                             \___|\___/|_| \_\_____|
+ *
+ * Web crawler based on curl and libxml2.
+ * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
+ * License: MIT
+ *
+ * To compile:
+ *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
+ *
+ */
+/* <DESC>
+ * Web crawler based on curl and libxml2 to stress-test curl with
+ * hundreds of concurrent connections to various servers.
+ * </DESC>
+ */
+
+/* Parameters */
+/* handed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle */
+int max_con = 200;
+/* links are only followed while complete + pending stays below this */
+int max_total = 20000;
+/* links are only followed while fewer than this many are pending */
+int max_requests = 500;
+/* threshold at which follow_links() stops adding links from one page */
+int max_link_per_page = 5;
+/* when non-zero, every href is resolved against the page URL first */
+int follow_relative_links = 0;
+/* URL of the first transfer that main() adds to the multi handle */
+char *start_page = "https://www.reuters.com";
+
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <libxml/uri.h>
+#include <curl/curl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <signal.h>
+
+/* Set to 1 by the SIGINT handler and polled by the main loop.
+ * volatile sig_atomic_t is the only object type a signal handler may
+ * portably write, and volatile keeps the compiler from caching the flag
+ * in the loop that polls it. */
+volatile sig_atomic_t pending_interrupt = 0;
+
+/* Signal handler: records that an interrupt arrived and does nothing else.
+ * The signal number is not used. */
+void sighandler(int dummy)
+{
+  (void) dummy;
+  pending_interrupt = 1;
+}
+
+/* resizable buffer: 'buf' holds the 'size' bytes received so far.
+ * The data is not null-terminated. */
+typedef struct {
+  char *buf;
+  size_t size;
+} memory;
+
+/* Write callback (installed with CURLOPT_WRITEFUNCTION).
+ *
+ * Appends the sz * nmemb bytes at 'contents' to the memory buffer passed
+ * in 'ctx' and returns the number of bytes stored.
+ *
+ * When the buffer cannot grow (allocation failure or a size that would
+ * wrap around) it returns 0 and leaves the buffer untouched; libcurl
+ * treats a return value that differs from the delivered amount as a
+ * write error and aborts that transfer. */
+size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
+{
+  memory *mem = (memory*) ctx;
+  size_t realsize;
+  char *ptr;
+
+  /* reject a product that does not fit in size_t */
+  if(sz && nmemb > (size_t)-1 / sz)
+    return 0;
+  realsize = sz * nmemb;
+
+  /* nothing to append; this also avoids realloc(ptr, 0) */
+  if(!realsize)
+    return 0;
+  if(realsize > (size_t)-1 - mem->size)
+    return 0;
+
+  /* keep the old pointer until we know the new one is valid */
+  ptr = realloc(mem->buf, mem->size + realsize);
+  if(!ptr)
+    return 0;
+  mem->buf = ptr;
+  memcpy(&(mem->buf[mem->size]), contents, realsize);
+  mem->size += realsize;
+  return realsize;
+}
+
+/* Creates an easy handle that downloads 'url' into a fresh body buffer.
+ *
+ * The buffer is attached twice: as CURLOPT_WRITEDATA, so grow_buffer()
+ * fills it, and as CURLOPT_PRIVATE, so whoever cleans up the handle can
+ * get it back with CURLINFO_PRIVATE and free both buf and the struct.
+ *
+ * Returns the new handle, or NULL if the handle or the buffer could not
+ * be allocated (nothing is leaked in that case). */
+CURL *make_handle(char *url)
+{
+  CURL *handle;
+
+  /* buffer body */
+  memory *mem = malloc(sizeof(*mem));
+  if(!mem)
+    return NULL;
+  mem->size = 0;
+  mem->buf = malloc(1);
+  if(!mem->buf) {
+    free(mem);
+    return NULL;
+  }
+
+  handle = curl_easy_init();
+  if(!handle) {
+    free(mem->buf);
+    free(mem);
+    return NULL;
+  }
+
+  /* Important: use HTTP2 over HTTPS */
+  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
+  curl_easy_setopt(handle, CURLOPT_URL, url);
+
+  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
+  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
+  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
+
+  /* For completeness */
+  /* CURLOPT_ACCEPT_ENCODING is the current name of CURLOPT_ENCODING */
+  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "gzip, deflate");
+  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
+  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
+  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
+  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
+  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
+  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
+  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
+  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
+  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
+  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
+  return handle;
+}
+
+/* HREF finder implemented in libxml2 but could be any HTML parser.
+ *
+ * Parses the body in 'mem' as HTML (with 'url' as the document URL), draws
+ * random <a href> attributes from it and adds a new transfer to
+ * 'multi_handle' for every drawn link that is an absolute http:// or
+ * https:// URL of at least 20 characters. It stops once max_link_per_page
+ * transfers were added, or after as many draws as the page has links.
+ * Because the draws are random, the same link can be picked twice.
+ *
+ * Returns the number of transfers that were actually added. */
+size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
+{
+  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR |
+             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
+  size_t limit = max_link_per_page > 0 ? (size_t) max_link_per_page : 0;
+  size_t count = 0;
+  xmlXPathContextPtr context;
+  xmlXPathObjectPtr result = NULL;
+  xmlNodeSetPtr nodeset;
+  htmlDocPtr doc;
+
+  doc = htmlReadMemory(mem->buf, (int) mem->size, url, NULL, opts);
+  if(!doc)
+    return 0;
+
+  context = xmlXPathNewContext(doc);
+  if(!context)
+    goto cleanup;
+  result = xmlXPathEvalExpression((xmlChar *) "//a/@href", context);
+  xmlXPathFreeContext(context);
+  if(!result)
+    goto cleanup;
+  nodeset = result->nodesetval;
+  if(xmlXPathNodeSetIsEmpty(nodeset))
+    goto cleanup;
+
+  for(int i = 0; i < nodeset->nodeNr && count < limit; i++) {
+    /* dividing by RAND_MAX + 1.0 keeps the index strictly below nodeNr,
+       even when rand() returns RAND_MAX */
+    int x = (int) (rand() / (RAND_MAX + 1.0) * nodeset->nodeNr);
+    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
+    xmlChar *href = xmlNodeListGetString(doc, node, 1);
+    char *link;
+
+    if(href && follow_relative_links) {
+      xmlChar *orig = href;
+      href = xmlBuildURI(orig, (xmlChar *) url);
+      xmlFree(orig);
+    }
+    if(!href)
+      continue;
+
+    link = (char *) href;
+    if(strlen(link) >= 20 &&
+       (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
+      CURL *handle = make_handle(link);
+      if(handle) {
+        if(curl_multi_add_handle(multi_handle, handle) == CURLM_OK)
+          count++;
+        else {
+          /* the multi handle did not take it: release the handle and
+             the body buffer attached to it */
+          memory *body = NULL;
+          curl_easy_getinfo(handle, CURLINFO_PRIVATE, &body);
+          curl_easy_cleanup(handle);
+          if(body) {
+            free(body->buf);
+            free(body);
+          }
+        }
+      }
+    }
+    /* libcurl copied the URL, so the string is ours to free on every path */
+    xmlFree(href);
+  }
+
+cleanup:
+  /* the node set points into the document, so free it first */
+  xmlXPathFreeObject(result);
+  xmlFreeDoc(doc);
+  return count;
+}
+
+/* Returns 1 when the Content-Type value 'ctype' mentions "text/html" and
+ * 0 otherwise, including when 'ctype' is NULL.
+ *
+ * No minimum length is required: a bare "text/html" without a charset
+ * parameter is only nine characters long and must qualify too. */
+int is_html(char *ctype)
+{
+  return ctype != NULL && strstr(ctype, "text/html") != NULL;
+}
+
+/* Crawls the web starting at start_page.
+ *
+ * All transfers run on one multi handle. Each finished transfer is
+ * reported on stdout; a 200 response with an HTML body of more than 100
+ * bytes has its links followed, as long as the max_requests and max_total
+ * limits allow. The loop ends when no transfer is running any more or
+ * when SIGINT was received.
+ *
+ * Returns 0, or 1 if the multi handle could not be created. */
+int main(void)
+{
+  signal(SIGINT, sighandler);
+  LIBXML_TEST_VERSION;
+  curl_global_init(CURL_GLOBAL_DEFAULT);
+  CURLM *multi_handle = curl_multi_init();
+  if(!multi_handle) {
+    curl_global_cleanup();
+    return 1;
+  }
+  /* curl_multi_setopt() is variadic and this option is read as a long,
+     so an int argument has to be converted explicitly */
+  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS,
+                    (long) max_con);
+  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
+
+  /* enables http/2 if available */
+  #ifdef CURLPIPE_MULTIPLEX
+    curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
+  #endif
+
+  /* sets html start page */
+  curl_multi_add_handle(multi_handle, make_handle(start_page));
+
+  int msgs_left;
+  /* the start page is in flight too; without counting it here the
+     decrement for its completion would leave 'pending' one too low */
+  int pending = 1;
+  int complete = 0;
+  int still_running = 1;
+  while(still_running && !pending_interrupt) {
+    int numfds;
+    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
+    curl_multi_perform(multi_handle, &still_running);
+
+    /* See how the transfers went */
+    CURLMsg *m = NULL;
+    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
+      if(m->msg == CURLMSG_DONE) {
+        CURL *handle = m->easy_handle;
+        char *url = NULL;
+        memory *mem = NULL;
+        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
+        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
+        if(m->data.result == CURLE_OK) {
+          long res_status = 0;
+          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
+          if(res_status == 200) {
+            /* stays NULL when the server sent no Content-Type */
+            char *ctype = NULL;
+            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
+            printf("[%d] HTTP 200 (%s): %s\n", complete,
+                   ctype ? ctype : "(null)", url);
+            if(is_html(ctype) && mem && mem->size > 100) {
+              if(pending < max_requests && (complete + pending) < max_total) {
+                pending += follow_links(multi_handle, mem, url);
+                /* transfers may have been added: keep the loop alive
+                   until the next curl_multi_perform() updates the count */
+                still_running = 1;
+              }
+            }
+          }
+          else {
+            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
+          }
+        }
+        else {
+          printf("[%d] Connection failure: %s\n", complete, url);
+        }
+        /* 'url' belongs to the handle, so it is not used past this point */
+        curl_multi_remove_handle(multi_handle, handle);
+        curl_easy_cleanup(handle);
+        if(mem) {
+          free(mem->buf);
+          free(mem);
+        }
+        complete++;
+        pending--;
+      }
+    }
+  }
+  curl_multi_cleanup(multi_handle);
+  curl_global_cleanup();
+  return 0;
+}