2018-07-03 08:32:17 -04:00
|
|
|
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Web crawler based on curl and libxml2.
 * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
 * License: MIT
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */
|
|
|
|
|
|
|
|
/* Parameters */

/* Value handed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle. */
int max_con = 200;

/* New links are only queued while (complete + pending) is below this. */
int max_total = 20000;

/* New links are only queued while fewer than this many are pending. */
int max_requests = 500;

/* Limit consulted by follow_links() when queueing links from one page. */
int max_link_per_page = 5;

/* Non-zero: resolve every href against the URL of the page it came from. */
int follow_relative_links = 0;

/* URL of the first page that is fetched. */
char *start_page = "https://www.reuters.com";
|
|
|
|
|
|
|
|
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>
|
|
|
|
|
|
|
|
/*
 * Set to 1 by sighandler(); the main loop stops once it is non-zero.
 *
 * The flag is written from an asynchronous signal handler, so it must be a
 * volatile sig_atomic_t: that is the only object type the C standard
 * guarantees can be safely assigned in that context.
 */
volatile sig_atomic_t pending_interrupt = 0;

/*
 * Signal handler: records that an interrupt arrived and does nothing else.
 *
 * dummy: the signal number; not used.
 */
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}
|
|
|
|
|
|
|
|
/* resizable buffer */
typedef struct {
  char *buf;    /* heap block holding the bytes collected so far */
  size_t size;  /* number of valid bytes in buf */
} memory;

/*
 * Write callback (see CURLOPT_WRITEFUNCTION): appends one chunk of received
 * data to the memory buffer passed as ctx.
 *
 * contents: start of the chunk
 * sz, nmemb: the chunk is sz * nmemb bytes long
 * ctx:      the memory struct to append to
 *
 * Returns the number of bytes stored. That equals sz * nmemb on success.
 * On allocation failure or size overflow it returns 0 and leaves the buffer
 * untouched; libcurl treats a short return as a write error and aborts the
 * transfer.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  memory *mem = (memory *)ctx;
  size_t realsize;
  char *grown;

  /* refuse chunks whose byte count does not fit in a size_t */
  if(sz != 0 && nmemb > (size_t)-1 / sz)
    return 0;
  realsize = sz * nmemb;

  /* nothing to append; also keeps us away from realloc(ptr, 0) */
  if(realsize == 0)
    return 0;

  /* the grown buffer size must not wrap around either */
  if(realsize > (size_t)-1 - mem->size)
    return 0;

  /* keep the old pointer until realloc has succeeded so it is never lost */
  grown = realloc(mem->buf, mem->size + realsize);
  if(!grown)
    return 0;

  mem->buf = grown;
  memcpy(mem->buf + mem->size, contents, realsize);
  mem->size += realsize;
  return realsize;
}
|
|
|
|
|
|
|
|
CURL *make_handle(char *url)
|
|
|
|
{
|
|
|
|
CURL *handle = curl_easy_init();
|
|
|
|
|
|
|
|
/* Important: use HTTP2 over HTTPS */
|
|
|
|
curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_URL, url);
|
|
|
|
|
|
|
|
/* buffer body */
|
|
|
|
memory *mem = malloc(sizeof(memory));
|
|
|
|
mem->size = 0;
|
|
|
|
mem->buf = malloc(1);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
|
|
|
|
|
|
|
|
/* For completeness */
|
|
|
|
curl_easy_setopt(handle, CURLOPT_ENCODING, "gzip, deflate");
|
|
|
|
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
|
|
|
|
curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
|
|
|
|
curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
|
|
|
|
curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
|
|
|
|
return handle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* HREF finder implemented in libxml2 but could be any HTML parser */
|
|
|
|
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
|
|
|
|
{
|
|
|
|
int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
|
|
|
|
HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
|
|
|
|
htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
|
|
|
|
if(!doc)
|
|
|
|
return 0;
|
|
|
|
xmlChar *xpath = (xmlChar*) "//a/@href";
|
|
|
|
xmlXPathContextPtr context = xmlXPathNewContext(doc);
|
|
|
|
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
|
|
|
|
xmlXPathFreeContext(context);
|
|
|
|
if(!result)
|
|
|
|
return 0;
|
|
|
|
xmlNodeSetPtr nodeset = result->nodesetval;
|
|
|
|
if(xmlXPathNodeSetIsEmpty(nodeset)) {
|
|
|
|
xmlXPathFreeObject(result);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
size_t count = 0;
|
|
|
|
for(int i = 0; i < nodeset->nodeNr; i++) {
|
|
|
|
double r = rand();
|
|
|
|
int x = r * nodeset->nodeNr / RAND_MAX;
|
|
|
|
const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
|
|
|
|
xmlChar *href = xmlNodeListGetString(doc, node, 1);
|
|
|
|
if(follow_relative_links) {
|
|
|
|
xmlChar *orig = href;
|
|
|
|
href = xmlBuildURI(href, (xmlChar *) url);
|
|
|
|
xmlFree(orig);
|
|
|
|
}
|
|
|
|
char *link = (char *) href;
|
|
|
|
if(!link || strlen(link) < 20)
|
|
|
|
continue;
|
|
|
|
if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
|
|
|
|
curl_multi_add_handle(multi_handle, make_handle(link));
|
|
|
|
if(count++ == max_link_per_page)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
xmlFree(link);
|
|
|
|
}
|
|
|
|
xmlXPathFreeObject(result);
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tell whether a Content-Type header value denotes an HTML document.
 *
 * ctype: header value, may be NULL
 *
 * Returns 1 when ctype contains "text/html" (with or without parameters
 * such as "; charset=utf-8"), 0 otherwise or when ctype is NULL.
 */
int is_html(char *ctype)
{
  if(!ctype)
    return 0;
  /* no minimum length: the bare value "text/html" must be accepted too */
  return strstr(ctype, "text/html") != NULL;
}
|
|
|
|
|
|
|
|
int main(void)
|
|
|
|
{
|
|
|
|
signal(SIGINT, sighandler);
|
|
|
|
LIBXML_TEST_VERSION;
|
|
|
|
curl_global_init(CURL_GLOBAL_DEFAULT);
|
|
|
|
CURLM *multi_handle = curl_multi_init();
|
|
|
|
curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
|
|
|
|
curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
|
|
|
|
|
|
|
|
/* enables http/2 if available */
|
2018-07-11 05:47:21 -04:00
|
|
|
#ifdef CURLPIPE_MULTIPLEX
|
|
|
|
curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
|
|
|
|
#endif
|
2018-07-03 08:32:17 -04:00
|
|
|
|
|
|
|
/* sets html start page */
|
|
|
|
curl_multi_add_handle(multi_handle, make_handle(start_page));
|
|
|
|
|
|
|
|
int msgs_left;
|
|
|
|
int pending = 0;
|
|
|
|
int complete = 0;
|
|
|
|
int still_running = 1;
|
|
|
|
while(still_running && !pending_interrupt) {
|
|
|
|
int numfds;
|
|
|
|
curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
|
|
|
|
curl_multi_perform(multi_handle, &still_running);
|
|
|
|
|
|
|
|
/* See how the transfers went */
|
|
|
|
CURLMsg *m = NULL;
|
|
|
|
while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
|
|
|
|
if(m->msg == CURLMSG_DONE) {
|
|
|
|
CURL *handle = m->easy_handle;
|
|
|
|
char *url;
|
|
|
|
memory *mem;
|
|
|
|
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
|
|
|
|
curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
|
|
|
|
if(m->data.result == CURLE_OK) {
|
|
|
|
long res_status;
|
|
|
|
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
|
|
|
|
if(res_status == 200) {
|
|
|
|
char *ctype;
|
|
|
|
curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
|
|
|
|
printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
|
|
|
|
if(is_html(ctype) && mem->size > 100) {
|
|
|
|
if(pending < max_requests && (complete + pending) < max_total) {
|
|
|
|
pending += follow_links(multi_handle, mem, url);
|
|
|
|
still_running = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
printf("[%d] Connection failure: %s\n", complete, url);
|
|
|
|
}
|
|
|
|
curl_multi_remove_handle(multi_handle, handle);
|
|
|
|
curl_easy_cleanup(handle);
|
|
|
|
free(mem->buf);
|
|
|
|
free(mem);
|
|
|
|
complete++;
|
|
|
|
pending--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
curl_multi_cleanup(multi_handle);
|
|
|
|
curl_global_cleanup();
|
|
|
|
return 0;
|
|
|
|
}
|