curl/docs/examples/crawler.c

/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Web crawler based on curl and libxml2.
 * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
 * License: MIT
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

/* Parameters */
/* Passed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle in main(). */
int max_con = 200;
/* main() stops following links once completed plus pending transfers
   reach this number. */
int max_total = 20000;
/* main() stops following links while at least this many transfers are
   pending. */
int max_requests = 500;
/* follow_links() stops queueing links from a page once its counter
   reaches this value. */
int max_link_per_page = 5;
/* When non-zero, follow_links() resolves each href against the page URL
   with xmlBuildURI() before filtering it. */
int follow_relative_links = 0;
/* URL of the first page queued by main(). */
char *start_page = "https://www.reuters.com";

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Set to 1 by sighandler() when SIGINT arrives and tested by the crawl loop
   in main(). volatile sig_atomic_t is the object type the C standard
   guarantees can be written safely from an asynchronous signal handler. */
volatile sig_atomic_t pending_interrupt = 0;

/* Signal handler: only raises the flag so the crawl loop can stop. */
void sighandler(int dummy)
{
  (void)dummy; /* the signal number is not needed */
  pending_interrupt = 1;
}

/* resizable buffer: buf holds size bytes of data (not NUL-terminated) */
typedef struct {
  char *buf;
  size_t size;
} memory;

/*
 * Write callback: appends the sz * nmemb bytes at contents to the memory
 * buffer passed as ctx.
 *
 * Returns the number of bytes stored. On size overflow or allocation
 * failure the buffer is left untouched and 0 is returned; libcurl treats a
 * return value that differs from the amount it passed in as a write error
 * and aborts the transfer.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  memory *mem = (memory*) ctx;
  size_t realsize;
  char *grown;

  /* reject sizes that would wrap around size_t */
  if(sz && nmemb > (size_t)-1 / sz)
    return 0;
  realsize = sz * nmemb;
  if(realsize > (size_t)-1 - mem->size)
    return 0;

  /* nothing to append; also avoids a zero-sized realloc() on an empty
     buffer */
  if(!realsize)
    return 0;

  /* keep the old pointer until realloc() has succeeded so that it is
     neither leaked nor replaced by NULL */
  grown = realloc(mem->buf, mem->size + realsize);
  if(!grown)
    return 0;
  mem->buf = grown;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}

CURL *make_handle(char *url)
{
  CURL *handle = curl_easy_init();

  /* Important: use HTTP2 over HTTPS */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body */
  memory *mem = malloc(sizeof(memory));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ENCODING, "gzip, deflate");
  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}

/*
 * HREF finder implemented in libxml2 but could be any HTML parser.
 *
 * Parses the page body held in mem (fetched from url), picks random
 * <a href> attributes and queues a new transfer on multi_handle for every
 * picked link that is an absolute http:// or https:// URL of at least 20
 * characters. At most max_link_per_page transfers are queued per call.
 *
 * Returns the number of transfers actually added to multi_handle.
 */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc;
  xmlXPathContextPtr context;
  xmlXPathObjectPtr result = NULL;
  size_t count = 0;
  size_t limit = max_link_per_page > 0 ? (size_t) max_link_per_page : 0;

  /* htmlReadMemory() takes the length as int; refuse bodies that would be
     truncated by the conversion */
  if(mem->size > INT_MAX)
    return 0;
  doc = htmlReadMemory(mem->buf, (int) mem->size, url, NULL, opts);
  if(!doc)
    return 0;

  context = xmlXPathNewContext(doc);
  if(context) {
    result = xmlXPathEvalExpression((xmlChar*) "//a/@href", context);
    xmlXPathFreeContext(context);
  }

  if(result && !xmlXPathNodeSetIsEmpty(result->nodesetval)) {
    xmlNodeSetPtr nodeset = result->nodesetval;
    for(int i = 0; i < nodeset->nodeNr && count < limit; i++) {
      /* modulo keeps the index inside [0, nodeNr) even when rand()
         returns RAND_MAX */
      int x = rand() % nodeset->nodeNr;
      const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
      xmlChar *href = xmlNodeListGetString(doc, node, 1);
      if(follow_relative_links) {
        xmlChar *orig = href;
        href = xmlBuildURI(href, (xmlChar *) url);
        xmlFree(orig);
      }
      char *link = (char *) href;
      if(link && strlen(link) >= 20 &&
         (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
        CURL *handle = make_handle(link);
        if(handle) {
          if(curl_multi_add_handle(multi_handle, handle) == CURLM_OK) {
            count++;
          }
          else {
            /* not queued: release the handle and its private buffer so
               they are not leaked, and do not count the link */
            memory *priv = NULL;
            curl_easy_getinfo(handle, CURLINFO_PRIVATE, &priv);
            curl_easy_cleanup(handle);
            if(priv) {
              free(priv->buf);
              free(priv);
            }
          }
        }
      }
      /* the href string is released on every path, including rejected
         links */
      if(link)
        xmlFree(link);
    }
  }

  if(result)
    xmlXPathFreeObject(result);
  /* the parsed document is owned by this function */
  xmlFreeDoc(doc);
  return count;
}

/*
 * Returns 1 if the Content-Type value ctype contains "text/html" and 0
 * otherwise, including when ctype is NULL.
 */
int is_html(char *ctype)
{
  /* no minimum length: a bare "text/html" without parameters must match */
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}

int main(void)
{
  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION;
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURLM *multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enables http/2 if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* sets html start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  int msgs_left;
  int pending = 0;
  int complete = 0;
  int still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    CURLMsg *m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        memory *mem;
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
example/crawler.c: simple crawler based on libxml2 Closes #2706 2018-07-03 08:32:17 -04:00			`/***************************************************************************`
			`* _ _ ____ _`
			`* Project ___\| \| \| \| _ \\| \|`
			`* / __\| \| \| \| \|_) \| \|`
			`* \| (__\| \|_\| \| _ <\| \|___`
			`* \___\|\___/\|_\| \_\_____\|`
			`*`
			`* Web crawler based on curl and libxml2.`
			`* Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>`
			`* License: MIT`
			`*`
			`* To compile:`
			`* gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)`
			`*`
			`*/`
			`/* <DESC>`
			`* Web crawler based on curl and libxml2 to stress-test curl with`
			`* hundreds of concurrent connections to various servers.`
			`* </DESC>`
			`*/`

			`/* Parameters */`
			`int max_con = 200;`
			`int max_total = 20000;`
			`int max_requests = 500;`
			`int max_link_per_page = 5;`
			`int follow_relative_links = 0;`
			`char *start_page = "https://www.reuters.com";`

			`#include <libxml/HTMLparser.h>`
			`#include <libxml/xpath.h>`
			`#include <libxml/uri.h>`
			`#include <curl/curl.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <math.h>`
			`#include <signal.h>`

			`int pending_interrupt = 0;`
			`void sighandler(int dummy)`
			`{`
			`pending_interrupt = 1;`
			`}`

			`/* resizable buffer */`
			`typedef struct {`
			`char *buf;`
			`size_t size;`
			`} memory;`

			`size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)`
			`{`
			`size_t realsize = sz * nmemb;`
			`memory *mem = (memory*) ctx;`
			`mem->buf = realloc(mem->buf, mem->size + realsize);`
			`memcpy(&(mem->buf[mem->size]), contents, realsize);`
			`mem->size += realsize;`
			`return realsize;`
			`}`

			`CURL *make_handle(char *url)`
			`{`
			`CURL *handle = curl_easy_init();`

			`/* Important: use HTTP2 over HTTPS */`
			`curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);`
			`curl_easy_setopt(handle, CURLOPT_URL, url);`

			`/* buffer body */`
			`memory *mem = malloc(sizeof(memory));`
			`mem->size = 0;`
			`mem->buf = malloc(1);`
			`curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);`
			`curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);`
			`curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);`

			`/* For completeness */`
			`curl_easy_setopt(handle, CURLOPT_ENCODING, "gzip, deflate");`
			`curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);`
			`curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);`
			`curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);`
			`curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);`
			`curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");`
			`curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);`
			`curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");`
			`curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);`
			`curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);`
			`curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);`
			`curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);`
			`return handle;`
			`}`

			`/* HREF finder implemented in libxml2 but could be any HTML parser */`
			`size_t follow_links(CURLM *multi_handle, memory *mem, char *url)`
			`{`
			`int opts = HTML_PARSE_NOBLANKS \| HTML_PARSE_NOERROR \| \`
			`HTML_PARSE_NOWARNING \| HTML_PARSE_NONET;`
			`htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);`
			`if(!doc)`
			`return 0;`
			`xmlChar *xpath = (xmlChar*) "//a/@href";`
			`xmlXPathContextPtr context = xmlXPathNewContext(doc);`
			`xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);`
			`xmlXPathFreeContext(context);`
			`if(!result)`
			`return 0;`
			`xmlNodeSetPtr nodeset = result->nodesetval;`
			`if(xmlXPathNodeSetIsEmpty(nodeset)) {`
			`xmlXPathFreeObject(result);`
			`return 0;`
			`}`
			`size_t count = 0;`
			`for(int i = 0; i < nodeset->nodeNr; i++) {`
			`double r = rand();`
			`int x = r * nodeset->nodeNr / RAND_MAX;`
			`const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;`
			`xmlChar *href = xmlNodeListGetString(doc, node, 1);`
			`if(follow_relative_links) {`
			`xmlChar *orig = href;`
			`href = xmlBuildURI(href, (xmlChar *) url);`
			`xmlFree(orig);`
			`}`
			`char *link = (char *) href;`
			`if(!link \|\| strlen(link) < 20)`
			`continue;`
			`if(!strncmp(link, "http://", 7) \|\| !strncmp(link, "https://", 8)) {`
			`curl_multi_add_handle(multi_handle, make_handle(link));`
			`if(count++ == max_link_per_page)`
			`break;`
			`}`
			`xmlFree(link);`
			`}`
			`xmlXPathFreeObject(result);`
			`return count;`
			`}`

			`int is_html(char *ctype)`
			`{`
			`return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html");`
			`}`

			`int main(void)`
			`{`
			`signal(SIGINT, sighandler);`
			`LIBXML_TEST_VERSION;`
			`curl_global_init(CURL_GLOBAL_DEFAULT);`
			`CURLM *multi_handle = curl_multi_init();`
			`curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);`
			`curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);`

			`/* enables http/2 if available */`
examples/crawler.c: move #ifdef to column 0 Apparently the C => HTML converter on the web site doesn't quite like it otherwise. Reported-by: Jeroen Ooms 2018-07-11 05:47:21 -04:00			`#ifdef CURLPIPE_MULTIPLEX`
			`curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);`
			`#endif`
example/crawler.c: simple crawler based on libxml2 Closes #2706 2018-07-03 08:32:17 -04:00
			`/* sets html start page */`
			`curl_multi_add_handle(multi_handle, make_handle(start_page));`

			`int msgs_left;`
			`int pending = 0;`
			`int complete = 0;`
			`int still_running = 1;`
			`while(still_running && !pending_interrupt) {`
			`int numfds;`
			`curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);`
			`curl_multi_perform(multi_handle, &still_running);`

			`/* See how the transfers went */`
			`CURLMsg *m = NULL;`
			`while((m = curl_multi_info_read(multi_handle, &msgs_left))) {`
			`if(m->msg == CURLMSG_DONE) {`
			`CURL *handle = m->easy_handle;`
			`char *url;`
			`memory *mem;`
			`curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);`
			`curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);`
			`if(m->data.result == CURLE_OK) {`
			`long res_status;`
			`curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);`
			`if(res_status == 200) {`
			`char *ctype;`
			`curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);`
			`printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);`
			`if(is_html(ctype) && mem->size > 100) {`
			`if(pending < max_requests && (complete + pending) < max_total) {`
			`pending += follow_links(multi_handle, mem, url);`
			`still_running = 1;`
			`}`
			`}`
			`}`
			`else {`
			`printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);`
			`}`
			`}`
			`else {`
			`printf("[%d] Connection failure: %s\n", complete, url);`
			`}`
			`curl_multi_remove_handle(multi_handle, handle);`
			`curl_easy_cleanup(handle);`
			`free(mem->buf);`
			`free(mem);`
			`complete++;`
			`pending--;`
			`}`
			`}`
			`}`
			`curl_multi_cleanup(multi_handle);`
			`curl_global_cleanup();`
			`return 0;`
			`}`