diff --git a/bootstrap.conf b/bootstrap.conf index 77230dbb..6473cbba 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -28,6 +28,7 @@ gnulib_modules=" accept alloca announce-gen +base32 bind c-ctype clock-time @@ -49,6 +50,7 @@ maintainer-makefile mbtowc mkdir crypto/md5 +crypto/sha1 pipe quote quotearg @@ -63,6 +65,7 @@ socket stdbool strcasestr strerror_r-posix +tmpdir unlocked-io update-copyright vasprintf diff --git a/configure.ac b/configure.ac index 76c6fa28..360f6c91 100644 --- a/configure.ac +++ b/configure.ac @@ -511,7 +511,19 @@ if test "X$iri" != "Xno"; then fi fi +dnl +dnl Check for UUID +dnl +AC_CHECK_HEADER(uuid/uuid.h, + AC_CHECK_LIB(uuid, uuid_generate, + [LIBS="${LIBS} -luuid" + AC_DEFINE([HAVE_LIBUUID], 1, + [Define if libuuid is available.]) + ]) +) + + dnl Needed by src/Makefile.am AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"]) diff --git a/src/ChangeLog b/src/ChangeLog index c2af118e..65c48072 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,6 @@ +2011-11-04 Giuseppe Scrivano + + 2011-10-07 Steven Schweda * connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions @@ -21,7 +24,10 @@ * openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const" "meth" argument to accommodate OpenSSL version 0.9.8, where that argument is not "const" in the OpenSSL function (SSL_CTX_new). + * test.c: Declare "program_argstring". * utils.c (fopen_excl): Comment typography. + * warc.h: New file. + * warc.c: New file. 
2011-10-02 Henrik Holst (tiny change) * http.c (gethttp): If 'contentonerror' is used then do not diff --git a/src/Makefile.am b/src/Makefile.am index 6b951988..8ef931a6 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c \ css_.c css-url.c \ ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \ http.c init.c log.c main.c netrc.c progress.c ptimer.c \ - recur.c res.c retr.c spider.c url.c \ + recur.c res.c retr.c spider.c url.c warc.c \ utils.c exits.c build_info.c $(IRI_OBJ) \ css-url.h css-tokens.h connect.h convert.h cookies.h \ ftp.h hash.h host.h html-parse.h html-url.h \ http.h http-ntlm.h init.h log.h mswindows.h netrc.h \ options.h progress.h ptimer.h recur.h res.h retr.h \ - spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \ + spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \ exits.h gettext.h nodist_wget_SOURCES = version.c EXTRA_wget_SOURCES = iri.c diff --git a/src/ftp.c b/src/ftp.c index f75397d0..989a1dda 100644 --- a/src/ftp.c +++ b/src/ftp.c @@ -49,6 +49,7 @@ as that of the covered work. */ #include "netrc.h" #include "convert.h" /* for downloaded_file */ #include "recur.h" /* for INFINITE_RECURSION */ +#include "warc.h" #ifdef __VMS # include "vms.h" @@ -237,10 +238,11 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **); /* Retrieves a file with denoted parameters through opening an FTP connection to the server. It always closes the data connection, - and closes the control connection in case of error. */ + and closes the control connection in case of error. If warc_tmp + is non-NULL, the downloaded data will be written there as well. 
*/ static uerr_t getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread, - wgint restval, ccon *con, int count) + wgint restval, ccon *con, int count, FILE *warc_tmp) { int csock, dtsock, local_sock, res; uerr_t err = RETROK; /* appease the compiler */ @@ -1155,7 +1157,7 @@ Error in server response, closing control connection.\n")); /* 2011-09-30 SMS. Added listing files to the set of non-"binary" (text, Stream_LF) files. (Wget works either way, but other programs, like, say, text - editors, work better on listing files which have text attributes.) + editors, work better on listing files which have text attributes.) Now we use "binary" attributes for a binary ("IMAGE") transfer, unless "--ftp-stmlf" was specified, and we always use non-"binary" (text, Stream_LF) attributes for a listing file, or for an ASCII @@ -1194,7 +1196,7 @@ Error in server response, closing control connection.\n")); } else if (opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct || opt.output_document || count > 0) - { + { if (opt.unlink && file_exists_p (con->target)) { int res = unlink (con->target); @@ -1274,7 +1276,7 @@ Error in server response, closing control connection.\n")); rd_size = 0; res = fd_read_body (dtsock, fp, expected_bytes ? expected_bytes - restval : 0, - restval, &rd_size, qtyread, &con->dltime, flags); + restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp); tms = datetime_str (time (NULL)); tmrate = retr_rate (rd_size, con->dltime); @@ -1285,15 +1287,18 @@ Error in server response, closing control connection.\n")); if (!output_stream || con->cmd & DO_LIST) fclose (fp); - /* If fd_read_body couldn't write to fp, bail out. */ - if (res == -2) + /* If fd_read_body couldn't write to fp or warc_tmp, bail out. 
*/ + if (res == -2 || (warc_tmp != NULL && res == -3)) { logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"), con->target, strerror (errno)); fd_close (csock); con->csock = -1; fd_close (dtsock); - return FWRITEERR; + if (res == -2) + return FWRITEERR; + else if (res == -3) + return WARC_TMP_FWRITEERR; } else if (res == -1) { @@ -1409,6 +1414,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi uerr_t err; struct_stat st; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + ip_address *warc_ip = NULL; + /* Get the target, and set the name for the message accordingly. */ if ((f == NULL) && (con->target)) { @@ -1445,6 +1455,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0; + /* For file RETR requests, we can write a WARC record. + We record the file contents to a temporary file. */ + if (warc_enabled && (con->cmd & DO_RETR)) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + return WARC_TMP_FOPENERR; + + if (!con->proxy && con->csock != -1) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER); + } + } + /* THE loop. */ do { @@ -1509,7 +1534,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi len = f->size; else len = 0; - err = getftp (u, len, &qtyread, restval, con, count); + + /* If we are working on a WARC record, getftp should also write + to the warc_tmp file. 
*/ + err = getftp (u, len, &qtyread, restval, con, count, warc_tmp); if (con->csock == -1) con->st &= ~DONE_CWD; @@ -1520,8 +1548,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi { case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR: case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED: - case UNLINKERR: + case UNLINKERR: case WARC_TMP_FWRITEERR: /* Fatal errors, give up. */ + if (warc_tmp != NULL) + fclose (warc_tmp); return err; case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR: case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR: @@ -1589,6 +1619,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi xfree (hurl); } + if (warc_enabled && (con->cmd & DO_RETR)) + { + /* Create and store a WARC resource record for the retrieved file. */ + bool warc_res; + + warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL, + warc_ip, NULL, warc_tmp, -1); + if (! warc_res) + return WARC_ERR; + + /* warc_write_resource_record has also closed warc_tmp. */ + } + if ((con->cmd & DO_LIST)) /* This is a directory listing file. */ { @@ -1928,7 +1971,9 @@ Already have correct symlink %s -> %s\n\n"), xfree (ofile); /* Break on fatals. */ - if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR) + if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR + || err == WARC_ERR || err == WARC_TMP_FOPENERR + || err == WARC_TMP_FWRITEERR) break; con->cmd &= ~ (DO_CWD | DO_LOGIN); f = f->next; diff --git a/src/http.c b/src/http.c index 7eef453f..6a2ffe86 100644 --- a/src/http.c +++ b/src/http.c @@ -58,6 +58,7 @@ as that of the covered work. */ #include "md5.h" #include "convert.h" #include "spider.h" +#include "warc.h" #ifdef TESTING #include "test.h" @@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name) p += A_len; \ } while (0) -/* Construct the request and write it to FD using fd_write. 
*/ +/* Construct the request and write it to FD using fd_write. + If warc_tmp is set to a file pointer, the request string will + also be written to that file. */ static int -request_send (const struct request *req, int fd) +request_send (const struct request *req, int fd, FILE *warc_tmp) { char *request_string, *p; int i, size, write_error; @@ -374,6 +377,13 @@ request_send (const struct request *req, int fd) if (write_error < 0) logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), fd_errstr (fd)); + else if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); + if (warc_tmp_written != size - 1) + return -2; + } return write_error; } @@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname) /* Send the contents of FILE_NAME to SOCK. Make sure that exactly PROMISED_SIZE bytes are sent over the wire -- if the file is - longer, read only that much; if the file is shorter, report an error. */ + longer, read only that much; if the file is shorter, report an error. + If warc_tmp is set to a file pointer, the post data will + also be written to that file. */ static int -post_file (int sock, const char *file_name, wgint promised_size) +post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) { static char chunk[8192]; wgint written = 0; @@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size) fclose (fp); return -1; } + if (warc_tmp != NULL) + { + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); + if (warc_tmp_written != towrite) + { + fclose (fp); + return -2; + } + } written += towrite; } fclose (fp); @@ -1462,6 +1484,135 @@ File %s already there; not retrieving.\n\n"), quote (filename)); *dt |= TEXTHTML; } +/* Download the response body from the socket and writes it to + an output file. 
The headers have already been read from the + socket. If WARC is enabled, the response body will also be + written to a WARC response record. + + hs, contlen, contrange, chunked_transfer_encoding and url are + parameters from the gethttp method. fp is a pointer to the + output file. + + url, warc_timestamp_str, warc_request_uuid, warc_ip, type + and statcode will be saved in the headers of the WARC record. + The head parameter contains the HTTP headers of the response. + + If fp is NULL and WARC is enabled, the response body will be + written only to the WARC file. If WARC is disabled and fp + is a file pointer, the data will be written to the file. + If fp is a file pointer and WARC is enabled, the body will + be written to both destinations. + + Returns the error code. */ +static int +read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, + wgint contrange, bool chunked_transfer_encoding, + char *url, char *warc_timestamp_str, char *warc_request_uuid, + ip_address *warc_ip, char *type, int statcode, char *head) +{ + int warc_payload_offset = 0; + FILE *warc_tmp = NULL; + int warcerr = 0; + + if (opt.warc_filename != NULL) + { + /* Open a temporary file where we can write the response before we + add it to the WARC record. */ + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + warcerr = WARC_TMP_FOPENERR; + + if (warcerr == 0) + { + /* We should keep the response headers for the WARC record. */ + int head_len = strlen (head); + int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); + if (warc_tmp_written != head_len) + warcerr = WARC_TMP_FWRITEERR; + warc_payload_offset = head_len; + } + + if (warcerr != 0) + { + if (warc_tmp != NULL) + fclose (warc_tmp); + return warcerr; + } + } + + if (fp != NULL) + { + /* This confuses the timestamping code that checks for file size. + #### The timestamping code should be smarter about file size. 
*/ + if (opt.save_headers && hs->restval == 0) + fwrite (head, 1, strlen (head), fp); + } + + /* Read the response body. */ + int flags = 0; + if (contlen != -1) + /* If content-length is present, read that much; otherwise, read + until EOF. The HTTP spec doesn't require the server to + actually close the connection when it's done sending data. */ + flags |= rb_read_exactly; + if (fp != NULL && hs->restval > 0 && contrange == 0) + /* If the server ignored our range request, instruct fd_read_body + to skip the first RESTVAL bytes of body. */ + flags |= rb_skip_startpos; + if (chunked_transfer_encoding) + flags |= rb_chunked_transfer_encoding; + + hs->len = hs->restval; + hs->rd_size = 0; + /* Download the response body and write it to fp. + If we are working on a WARC file, we simultaneously write the + response body to warc_tmp. */ + hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, + hs->restval, &hs->rd_size, &hs->len, &hs->dltime, + flags, warc_tmp); + if (hs->res >= 0) + { + if (warc_tmp != NULL) + { + /* Create a response record and write it to the WARC file. + Note: per the WARC standard, the request and response should share + the same date header. We re-use the timestamp of the request. + The response record should also refer to the uuid of the request. */ + bool r = warc_write_response_record (url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset, + type, statcode, hs->newloc); + + /* warc_write_response_record has closed warc_tmp. */ + + if (! r) + return WARC_ERR; + } + + return RETRFINISHED; + } + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (hs->res == -2) + { + /* Error while writing to fd. */ + return FWRITEERR; + } + else if (hs->res == -3) + { + /* Error while writing to warc_tmp. */ + return WARC_TMP_FWRITEERR; + } + else + { + /* A read error! 
*/ + hs->rderrmsg = xstrdup (fd_errstr (sock)); + return RETRFINISHED; + } +} + #define BEGINS_WITH(line, string_constant) \ (!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \ && (c_isspace (line[sizeof (string_constant) - 1]) \ @@ -1519,9 +1670,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, wgint contlen, contrange; struct url *conn; FILE *fp; + int err; int sock = -1; - int flags; /* Set to 1 when the authorization has already been sent and should not be tried again. */ @@ -1547,6 +1698,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, char hdrval[256]; char *message; + /* Declare WARC variables. */ + bool warc_enabled = (opt.warc_filename != NULL); + FILE *warc_tmp = NULL; + char warc_timestamp_str [21]; + char warc_request_uuid [48]; + ip_address *warc_ip = NULL; + long int warc_payload_offset = -1; + /* Whether this connection will be kept alive after the HTTP request is done. */ bool keep_alive; @@ -1852,7 +2011,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, that the contents of Host would be exactly the same as the contents of CONNECT. */ - write_error = request_send (connreq, sock); + write_error = request_send (connreq, sock, 0); request_free (connreq); if (write_error < 0) { @@ -1924,8 +2083,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, #endif /* HAVE_SSL */ } + /* Open the temporary file where we will write the request. */ + if (warc_enabled) + { + warc_tmp = warc_tempfile (); + if (warc_tmp == NULL) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_TMP_FOPENERR; + } + + if (! proxy) + { + warc_ip = (ip_address *) alloca (sizeof (ip_address)); + socket_ip_address (sock, warc_ip, ENDPOINT_PEER); + } + } + /* Send the request to server. 
*/ - write_error = request_send (req, sock); + write_error = request_send (req, sock, warc_tmp); if (write_error >= 0) { @@ -1933,16 +2110,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, { DEBUGP (("[POST data: %s]\n", opt.post_data)); write_error = fd_write (sock, opt.post_data, post_data_size, -1); + if (write_error >= 0 && warc_tmp != NULL) + { + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftell (warc_tmp); + + /* Write a copy of the data to the WARC record. */ + int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp); + if (warc_tmp_written != post_data_size) + write_error = -2; + } } else if (opt.post_file_name && post_data_size != 0) - write_error = post_file (sock, opt.post_file_name, post_data_size); + { + if (warc_tmp != NULL) + /* Remember end of headers / start of payload. */ + warc_payload_offset = ftell (warc_tmp); + + write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp); + } } if (write_error < 0) { CLOSE_INVALIDATE (sock); request_free (req); - return WRITEFAILED; + + if (warc_tmp != NULL) + fclose (warc_tmp); + + if (write_error == -2) + return WARC_TMP_FWRITEERR; + else + return WRITEFAILED; } logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), proxy ? "Proxy" : "HTTP"); @@ -1950,6 +2150,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, contrange = 0; *dt &= ~RETROKF; + + if (warc_enabled) + { + bool warc_result; + /* Generate a timestamp and uuid for this request. */ + warc_timestamp (warc_timestamp_str); + warc_uuid_str (warc_request_uuid); + + /* Create a request record and store it in the WARC file. */ + warc_result = warc_write_request_record (u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, + warc_tmp, warc_payload_offset); + if (! warc_result) + { + CLOSE_INVALIDATE (sock); + request_free (req); + return WARC_ERR; + } + + /* warc_write_request_record has also closed warc_tmp. 
*/ + } + + read_header: head = read_http_response_head (sock); if (!head) @@ -2073,11 +2296,42 @@ read_header: if (statcode == HTTP_STATUS_UNAUTHORIZED) { /* Authorization is required. */ - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* Normally we are not interested in the response body. + But if we are writing a WARC file we are: we like to keep everything. */ + if (warc_enabled) + { + int err; + type = resp_header_strdup (resp, "Content-Type"); + err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + xfree_null (type); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + request_free (req); + xfree_null (message); + resp_free (resp); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + pconn.authorized = false; if (!auth_finished && (user && passwd)) { @@ -2325,11 +2579,42 @@ read_header: _("Location: %s%s\n"), hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), hs->newloc ? _(" [following]") : ""); - if (keep_alive && !head_only - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - CLOSE_FINISH (sock); + + /* In case the caller cares to look... */ + hs->len = 0; + hs->res = 0; + hs->restval = 0; + + /* Normally we are not interested in the response body of a redirect. + But if we are writing a WARC file we are: we like to keep everything. 
*/ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree_null (type); + xfree (head); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (keep_alive && !head_only + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree_null (type); xfree (head); /* From RFC2616: The status codes 303 and 307 have @@ -2447,8 +2732,6 @@ read_header: logputs (LOG_VERBOSE, "\n"); } } - xfree_null (type); - type = NULL; /* We don't need it any more. */ /* Return if we have no intention of further downloading. */ if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only) @@ -2456,21 +2739,48 @@ read_header: /* In case the caller cares to look... */ hs->len = 0; hs->res = 0; - xfree_null (type); - if (head_only) - /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the - servers not to send body in response to a HEAD request, and - those that do will likely be caught by test_socket_open. - If not, they can be worked around using - `--no-http-keep-alive'. */ - CLOSE_FINISH (sock); - else if (keep_alive - && skip_short_body (sock, contlen, chunked_transfer_encoding)) - /* Successfully skipped the body; also keep using the socket. */ - CLOSE_FINISH (sock); + hs->restval = 0; + + /* Normally we are not interested in the response body of an error response. + But if we are writing a WARC file we are: we like to keep everything. 
*/ + if (warc_enabled) + { + int err = read_response_body (hs, sock, NULL, contlen, 0, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); + + if (err != RETRFINISHED || hs->res < 0) + { + CLOSE_INVALIDATE (sock); + xfree (head); + xfree_null (type); + return err; + } + else + CLOSE_FINISH (sock); + } else - CLOSE_INVALIDATE (sock); + { + /* Since WARC is disabled, we are not interested in the response body. */ + if (head_only) + /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the + servers not to send body in response to a HEAD request, and + those that do will likely be caught by test_socket_open. + If not, they can be worked around using + `--no-http-keep-alive'. */ + CLOSE_FINISH (sock); + else if (keep_alive + && skip_short_body (sock, contlen, chunked_transfer_encoding)) + /* Successfully skipped the body; also keep using the socket. */ + CLOSE_FINISH (sock); + else + CLOSE_INVALIDATE (sock); + } + xfree (head); + xfree_null (type); return RETRFINISHED; } @@ -2512,6 +2822,7 @@ read_header: strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return UNLINKERR; } } @@ -2539,6 +2850,7 @@ read_header: hs->local_file); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPEN_EXCL_ERR; } } @@ -2547,6 +2859,7 @@ read_header: logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); CLOSE_INVALIDATE (sock); xfree (head); + xfree_null (type); return FOPENERR; } } @@ -2560,49 +2873,26 @@ read_header: HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); } - /* This confuses the timestamping code that checks for file size. - #### The timestamping code should be smarter about file size. 
*/ - if (opt.save_headers && hs->restval == 0) - fwrite (head, 1, strlen (head), fp); + + err = read_response_body (hs, sock, fp, contlen, contrange, + chunked_transfer_encoding, + u->url, warc_timestamp_str, + warc_request_uuid, warc_ip, type, + statcode, head); /* Now we no longer need to store the response header. */ xfree (head); - - /* Download the request body. */ - flags = 0; - if (contlen != -1) - /* If content-length is present, read that much; otherwise, read - until EOF. The HTTP spec doesn't require the server to - actually close the connection when it's done sending data. */ - flags |= rb_read_exactly; - if (hs->restval > 0 && contrange == 0) - /* If the server ignored our range request, instruct fd_read_body - to skip the first RESTVAL bytes of body. */ - flags |= rb_skip_startpos; - - if (chunked_transfer_encoding) - flags |= rb_chunked_transfer_encoding; - - hs->len = hs->restval; - hs->rd_size = 0; - hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0, - hs->restval, &hs->rd_size, &hs->len, &hs->dltime, - flags); + xfree_null (type); if (hs->res >= 0) CLOSE_FINISH (sock); else - { - if (hs->res < 0) - hs->rderrmsg = xstrdup (fd_errstr (sock)); - CLOSE_INVALIDATE (sock); - } + CLOSE_INVALIDATE (sock); if (!output_stream) fclose (fp); - if (hs->res == -2) - return FWRITEERR; - return RETRFINISHED; + + return err; } /* The genuine HTTP loop! This is the part where the retrieval is @@ -2626,6 +2916,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc, char *file_name; bool force_full_retrieve = false; + + /* If we are writing to a WARC file: always retrieve the whole file. */ + if (opt.warc_filename != NULL) + force_full_retrieve = true; + + /* Assert that no value for *LOCAL_FILE was passed. */ assert (local_file == NULL || *local_file == NULL); @@ -2795,6 +3091,18 @@ Spider mode enabled. Check if remote file exists.\n")); /* Fatal errors just return from the function. 
*/ ret = err; goto exit; + case WARC_ERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n")); + ret = err; + goto exit; + case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: + /* A fatal WARC error. */ + logputs (LOG_VERBOSE, "\n"); + logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); + ret = err; + goto exit; case CONSSLERR: /* Another fatal error. */ logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); diff --git a/src/init.c b/src/init.c index eae35523..47fdea06 100644 --- a/src/init.c +++ b/src/init.c @@ -88,6 +88,7 @@ CMD_DECLARE (cmd_vector); CMD_DECLARE (cmd_spec_dirstruct); CMD_DECLARE (cmd_spec_header); +CMD_DECLARE (cmd_spec_warc_header); CMD_DECLARE (cmd_spec_htmlify); CMD_DECLARE (cmd_spec_mirror); CMD_DECLARE (cmd_spec_prefer_family); @@ -264,6 +265,15 @@ static const struct { { "verbose", NULL, cmd_spec_verbose }, { "wait", &opt.wait, cmd_time }, { "waitretry", &opt.waitretry, cmd_time }, + { "warccdx", &opt.warc_cdx_enabled, cmd_boolean }, + { "warccdxdedup", &opt.warc_cdx_dedup_filename, cmd_file }, + { "warccompression", &opt.warc_compression_enabled, cmd_boolean }, + { "warcdigests", &opt.warc_digests_enabled, cmd_boolean }, + { "warcfile", &opt.warc_filename, cmd_file }, + { "warcheader", NULL, cmd_spec_warc_header }, + { "warckeeplog", &opt.warc_keep_log, cmd_boolean }, + { "warcmaxsize", &opt.warc_maxsize, cmd_bytes }, + { "warctempdir", &opt.warc_tempdir, cmd_directory }, #ifdef USE_WATT32 { "wdebug", &opt.wdebug, cmd_boolean }, #endif @@ -362,6 +372,14 @@ defaults (void) opt.useservertimestamps = true; opt.show_all_dns_entries = false; + + opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */ + opt.warc_compression_enabled = true; + opt.warc_digests_enabled = true; + opt.warc_cdx_enabled = false; + opt.warc_cdx_dedup_filename = NULL; + opt.warc_tempdir = NULL; + opt.warc_keep_log = true; } /* Return the user's home directory (strdup-ed), or NULL 
if none is @@ -1235,6 +1253,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored) return true; } +static bool +cmd_spec_warc_header (const char *com, const char *val, void *place_ignored) +{ + /* Empty value means reset the list of headers. */ + if (*val == '\0') + { + free_vec (opt.warc_user_headers); + opt.warc_user_headers = NULL; + return true; + } + + if (!check_user_specified_header (val)) + { + fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"), + exec_name, com, quote (val)); + return false; + } + opt.warc_user_headers = vec_append (opt.warc_user_headers, val); + return true; +} + static bool cmd_spec_htmlify (const char *com, const char *val, void *place_ignored) { @@ -1639,6 +1678,7 @@ cleanup (void) xfree_null (opt.http_user); xfree_null (opt.http_passwd); free_vec (opt.user_headers); + free_vec (opt.warc_user_headers); # ifdef HAVE_SSL xfree_null (opt.cert_file); xfree_null (opt.private_key); diff --git a/src/log.c b/src/log.c index e6875f6b..0185df19 100644 --- a/src/log.c +++ b/src/log.c @@ -79,6 +79,10 @@ as that of the covered work. */ logging is inhibited, logfp is set back to NULL. */ static FILE *logfp; +/* A second file descriptor pointing to the temporary log file for the + WARC writer. If WARC writing is disabled, this is NULL. */ +static FILE *warclogfp; + /* If true, it means logging is inhibited, i.e. nothing is printed or stored. */ static bool inhibit_logging; @@ -304,6 +308,31 @@ get_log_fp (void) return logfp; return stderr; } + +/* Returns the file pointer for the secondary (WARC) log file. This is + WARCLOGFP once it has been set to a non-NULL value; unlike + get_log_fp there is no fallback to stderr, so NULL is returned + when no secondary log file has been set. + + If logging is inhibited, return NULL. */ + +static FILE * +get_warc_log_fp (void) +{ + if (inhibit_logging) + return NULL; + if (warclogfp) + return warclogfp; + return NULL; +} + +/* Sets the file descriptor for the secondary log file. 
*/ + +void +log_set_warc_log_fp (FILE * fp) +{ + warclogfp = fp; +} /* Log a literal string S. The string is logged as-is, without a newline appended. */ @@ -312,13 +341,17 @@ void logputs (enum log_options o, const char *s) { FILE *fp; + FILE *warcfp; check_redirect_output (); if ((fp = get_log_fp ()) == NULL) return; + warcfp = get_warc_log_fp (); CHECK_VERBOSE (o); FPUTS (s, fp); + if (warcfp != NULL) + FPUTS (s, warcfp); if (save_context_p) saved_append (s); if (flush_log_p) @@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt, int available_size = sizeof (smallmsg); int numwritten; FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); - if (!save_context_p) + if (!save_context_p && warcfp == NULL) { /* In the simple case just call vfprintf(), to avoid needless allocation and games with vsnprintf(). */ @@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt, } /* Writing succeeded. */ - saved_append (write_ptr); + if (save_context_p) + saved_append (write_ptr); FPUTS (write_ptr, fp); + if (warcfp != NULL) + FPUTS (write_ptr, warcfp); if (state->bigmsg) xfree (state->bigmsg); @@ -426,6 +463,7 @@ void logflush (void) { FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); if (fp) { /* 2005-10-25 SMS. 
@@ -440,6 +478,10 @@ logflush (void) fflush (fp); #endif /* def __VMS [else] */ } + + if (warcfp != NULL) + fflush (warcfp); + needs_flushing = false; } @@ -598,6 +640,7 @@ log_dump_context (void) { int num = log_line_current; FILE *fp = get_log_fp (); + FILE *warcfp = get_warc_log_fp (); if (!fp) return; @@ -609,14 +652,23 @@ log_dump_context (void) { struct log_ln *ln = log_lines + num; if (ln->content) - FPUTS (ln->content, fp); + { + FPUTS (ln->content, fp); + if (warcfp != NULL) + FPUTS (ln->content, warcfp); + } ROT_ADVANCE (num); } while (num != log_line_current); if (trailing_line) if (log_lines[log_line_current].content) - FPUTS (log_lines[log_line_current].content, fp); + { + FPUTS (log_lines[log_line_current].content, fp); + if (warcfp != NULL) + FPUTS (log_lines[log_line_current].content, warcfp); + } fflush (fp); + fflush (warcfp); } /* String escape functions. */ diff --git a/src/log.h b/src/log.h index 48c2f1b1..d74ca53d 100644 --- a/src/log.h +++ b/src/log.h @@ -34,8 +34,12 @@ as that of the covered work. */ /* The log file to which Wget writes to after HUP. */ #define DEFAULT_LOGFILE "wget-log" +#include + enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS }; +void log_set_warc_log_fp (FILE *); + void logprintf (enum log_options, const char *, ...) GCC_FORMAT_ATTR (2, 3); void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2); diff --git a/src/main.c b/src/main.c index 05ad0e76..28467359 100644 --- a/src/main.c +++ b/src/main.c @@ -55,6 +55,7 @@ as that of the covered work. 
*/ #include "spider.h" #include "http.h" /* for save_cookies */ #include "ptimer.h" +#include "warc.h" #include #include @@ -287,6 +288,15 @@ static struct cmdline_option option_data[] = { "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument }, { "wait", 'w', OPT_VALUE, "wait", -1 }, { "waitretry", 0, OPT_VALUE, "waitretry", -1 }, + { "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 }, + { "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 }, + { "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 }, + { "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 }, + { "warc-file", 0, OPT_VALUE, "warcfile", -1 }, + { "warc-header", 0, OPT_VALUE, "warcheader", -1 }, + { "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 }, + { "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 }, + { "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 }, #ifdef USE_WATT32 { "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 }, #endif @@ -652,6 +662,29 @@ FTP options:\n"), --retr-symlinks when recursing, get linked-to files (not dir).\n"), "\n", + N_("\ +WARC options:\n"), + N_("\ + --warc-file=FILENAME save request/response data to a .warc.gz file.\n"), + N_("\ + --warc-header=STRING insert STRING into the warcinfo record.\n"), + N_("\ + --warc-max-size=NUMBER set maximum size of WARC files to NUMBER.\n"), + N_("\ + --warc-cdx write CDX index files.\n"), + N_("\ + --warc-dedup=FILENAME do not store records listed in this CDX file.\n"), + N_("\ + --no-warc-compression do not compress WARC files with GZIP.\n"), + N_("\ + --no-warc-digests do not calculate SHA1 digests.\n"), + N_("\ + --no-warc-keep-log do not store the log file in a WARC record.\n"), + N_("\ + --warc-tempdir=DIRECTORY location for temporary files created by the\n\ + WARC writer.\n"), + "\n", + N_("\ Recursive download:\n"), N_("\ @@ -910,6 +943,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0) } char *program_name; /* Needed by lib/error.c. */ +char *program_argstring; /* Needed by wget_warc.c. 
*/ int main (int argc, char **argv) @@ -945,6 +979,22 @@ main (int argc, char **argv) windows_main ((char **) &exec_name); #endif + /* Construct the arguments string. */ + int argstring_length = 1; + for (i = 1; i < argc; i++) + argstring_length += strlen (argv[i]) + 2 + 1; + char *p = program_argstring = malloc (argstring_length * sizeof (char)); + for (i = 1; i < argc; i++) + { + *p++ = '"'; + int arglen = strlen (argv[i]); + memcpy (p, argv[i], arglen); + p += arglen; + *p++ = '"'; + *p++ = ' '; + } + *p = '\0'; + /* Load the hard-coded defaults. */ defaults (); @@ -1194,6 +1244,47 @@ for details.\n\n")); } } + if (opt.warc_filename != 0) + { + if (opt.noclobber) + { + fprintf (stderr, + _("WARC output does not work with --no-clobber, " + "--no-clobber will be disabled.\n")); + opt.noclobber = false; + } + if (opt.timestamping) + { + fprintf (stderr, + _("WARC output does not work with timestamping, " + "timestamping will be disabled.\n")); + opt.timestamping = false; + } + if (opt.spider) + { + fprintf (stderr, + _("WARC output does not work with --spider.\n")); + exit (1); + } + if (opt.always_rest) + { + fprintf (stderr, + _("WARC output does not work with --continue, " + "--continue will be disabled.\n")); + opt.always_rest = false; + } + if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled) + { + fprintf (stderr, + _("Digests are disabled; WARC deduplication will " + "not find duplicate records.\n")); + } + if (opt.warc_keep_log) + { + opt.progress_type = "dot"; + } + } + if (opt.ask_passwd && opt.passwd) { fprintf (stderr, @@ -1273,6 +1364,10 @@ for details.\n\n")); /* Initialize logging. */ log_init (opt.lfilename, append_to_log); + /* Open WARC file. */ + if (opt.warc_filename != 0) + warc_init (); + DEBUGP (("DEBUG output created by Wget %s on %s.\n\n", version_string, OS_TYPE)); @@ -1472,7 +1567,12 @@ outputting to a regular file.\n")); if (opt.convert_links && !opt.delete_after) convert_all_links (); + /* Close WARC file. 
*/ + if (opt.warc_filename != 0) + warc_close (); + log_close (); + for (i = 0; i < nurl; i++) xfree (url[i]); cleanup (); diff --git a/src/options.h b/src/options.h index 5e7c1eb6..0be66814 100644 --- a/src/options.h +++ b/src/options.h @@ -87,6 +87,15 @@ struct options FTP. */ char *output_document; /* The output file to which the documents will be printed. */ + char *warc_filename; /* WARC output filename */ + char *warc_tempdir; /* WARC temp dir */ + char *warc_cdx_dedup_filename; /* CDX file to be used for deduplication. */ + wgint warc_maxsize; /* WARC max archive size */ + bool warc_compression_enabled; /* For GZIP compression. */ + bool warc_digests_enabled; /* For SHA1 digests. */ + bool warc_cdx_enabled; /* Create CDX files? */ + bool warc_keep_log; /* Store the log file in a WARC record. */ + char **warc_user_headers; /* User-defined WARC header(s). */ char *user; /* Generic username */ char *passwd; /* Generic password */ diff --git a/src/retr.c b/src/retr.c index 73947658..3df582b8 100644 --- a/src/retr.c +++ b/src/retr.c @@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer) /* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that amount of data and decrease SKIP. Increment *TOTAL by the amount - of data written. */ + of data written. If OUT2 is not NULL, also write BUF to OUT2. + In case of error writing to OUT, -1 is returned. In case of error + writing to OUT2, -2 is returned. In case of any other error, + 1 is returned. 
*/ static int -write_data (FILE *out, const char *buf, int bufsize, wgint *skip, - wgint *written) +write_data (FILE *out, FILE *out2, const char *buf, int bufsize, + wgint *skip, wgint *written) { - if (!out) + if (out == NULL && out2 == NULL) return 1; if (*skip > bufsize) { @@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, return 1; } - fwrite (buf, 1, bufsize, out); + if (out != NULL) + fwrite (buf, 1, bufsize, out); + if (out2 != NULL) + fwrite (buf, 1, bufsize, out2); *written += bufsize; /* Immediately flush the downloaded data. This should not hinder @@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, actual justification. (Also, why 16K? Anyone test other values?) */ #ifndef __VMS - fflush (out); + if (out != NULL) + fflush (out); + if (out2 != NULL) + fflush (out2); #endif /* ndef __VMS */ - return !ferror (out); + if (out != NULL && ferror (out)) + return -1; + else if (out2 != NULL && ferror (out2)) + return -2; + else + return 0; } /* Read the contents of file descriptor FD until it the connection @@ -198,13 +212,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, the amount of data written to disk. The time it took to download the data is stored to ELAPSED. + If OUT2 is non-NULL, the contents is also written to OUT2. + The function exits and returns the amount of data read. In case of error while reading data, -1 is returned. In case of error while - writing data, -2 is returned. */ + writing data to OUT, -2 is returned. In case of error while writing + data to OUT2, -3 is returned. 
*/ int fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, - wgint *qtyread, wgint *qtywritten, double *elapsed, int flags) + wgint *qtyread, wgint *qtywritten, double *elapsed, int flags, + FILE *out2) { int ret = 0; #undef max @@ -343,9 +361,10 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, if (ret > 0) { sum_read += ret; - if (!write_data (out, dlbuf, ret, &skip, &sum_written)) + int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written); + if (write_res != 0) { - ret = -2; + ret = (write_res == -3) ? -3 : -2; goto out; } if (chunked) diff --git a/src/retr.h b/src/retr.h index 7329b037..22ab9ecd 100644 --- a/src/retr.h +++ b/src/retr.h @@ -50,7 +50,7 @@ enum { rb_chunked_transfer_encoding = 4 }; -int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int); +int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *); typedef const char *(*hunk_terminator_t) (const char *, const char *, int); diff --git a/src/test.c b/src/test.c index e7ce54cf..80abafff 100644 --- a/src/test.c +++ b/src/test.c @@ -46,6 +46,8 @@ const char *test_append_uri_pathel(); const char *test_are_urls_equal(); const char *test_is_robots_txt_url(); +const char *program_argstring = "TEST"; + int tests_run; static const char * diff --git a/src/warc.c b/src/warc.c new file mode 100644 index 00000000..77ef3692 --- /dev/null +++ b/src/warc.c @@ -0,0 +1,1332 @@ +/* Utility functions for writing WARC files. */ +#define _GNU_SOURCE + +#include "wget.h" +#include "hash.h" +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_LIBUUID +#include +#endif + +#include "warc.h" + +extern char *version_string; + +/* Set by main in main.c */ +extern char *program_argstring; + + +/* The log file (a temporary file that contains a copy + of the wget log). 
*/
+static FILE *warc_log_fp;
+
+/* The manifest file (a temporary file that contains the
+   warcinfo uuid of every file in this crawl). */
+static FILE *warc_manifest_fp;
+
+/* The current WARC file (or NULL, if WARC is disabled). */
+static FILE *warc_current_file;
+
+/* The gzip stream for the current WARC file
+   (or NULL, if WARC or gzip is disabled).
+   zlib's gzFile is already the handle type that gzdopen returns and
+   that gzwrite and gzclose accept, so this is a plain gzFile and not
+   a pointer to one. */
+static gzFile warc_current_gzfile;
+
+/* The offset of the current gzip record in the WARC file. */
+static size_t warc_current_gzfile_offset;
+
+/* The uncompressed size (so far) of the current record. */
+static size_t warc_current_gzfile_uncompressed_size;
+
+/* This is true until a warc_write_* method fails. */
+static bool warc_write_ok;
+
+/* The current CDX file (or NULL, if CDX is disabled). */
+static FILE *warc_current_cdx_file;
+
+/* The record id of the warcinfo record of the current WARC file. */
+static char *warc_current_warcinfo_uuid_str;
+
+/* The file name of the current WARC file. */
+static char *warc_current_filename;
+
+/* The serial number of the current WARC file. This number is
+   incremented each time a new file is opened and is used in the
+   WARC file's filename. */
+static int warc_current_file_number;
+
+/* The table of CDX records, if deduplication is enabled. */
+struct hash_table * warc_cdx_dedup_table;
+
+static bool warc_start_new_file (bool meta);
+
+
+/* One entry of the deduplication table. The table is keyed on the
+   DIGEST member (see warc_process_cdx_line). */
+struct warc_cdx_record
+{
+  char *url;                        /* Original URL of the record. */
+  char *uuid;                       /* WARC record id of the record. */
+  char digest[SHA1_DIGEST_SIZE];    /* Raw (decoded) SHA1 digest. */
+};
+
+/* Hash function for the deduplication table. KEY points to a raw
+   SHA1 digest; the leading sizeof (unsigned long) bytes are used as
+   the hash value. */
+static unsigned long
+warc_hash_sha1_digest (const void *key)
+{
+  unsigned long hash = 0;
+  memcpy (&hash, key, sizeof hash);
+  return hash;
+}
+
+/* Equality test for the deduplication table. Returns non-zero when
+   the SHA1_DIGEST_SIZE bytes at DIGEST1 and DIGEST2 are identical,
+   zero otherwise. */
+static int
+warc_cmp_sha1_digest (const void *digest1, const void *digest2)
+{
+  return memcmp (digest1, digest2, SHA1_DIGEST_SIZE) == 0;
+}
+
+
+
+/* Writes SIZE bytes from BUFFER to the current WARC file,
+   through gzwrite if compression is enabled.
+   Returns the number of uncompressed bytes written.
*/
+static size_t
+warc_write_buffer (const char *buffer, size_t size)
+{
+  if (warc_current_gzfile)
+    {
+      /* Count the raw bytes handed to zlib; warc_write_end_record
+         stores this total in the extra gzip header field. */
+      warc_current_gzfile_uncompressed_size += size;
+      return gzwrite (warc_current_gzfile, buffer, size);
+    }
+  else
+    return fwrite (buffer, 1, size, warc_current_file);
+}
+
+/* Writes STR to the current WARC file.
+   Does nothing once warc_write_ok is false.
+   Returns false and sets warc_write_ok to false if there
+   is an error. */
+static bool
+warc_write_string (const char *str)
+{
+  if (!warc_write_ok)
+    return false;
+
+  size_t length = strlen (str);
+  if (warc_write_buffer (str, length) != length)
+    warc_write_ok = false;
+
+  return warc_write_ok;
+}
+
+
+/* Size of the placeholder reserved in front of every gzip member
+   (must match the "XXXXXXXXXXXX" written below). */
+#define EXTRA_GZIP_HEADER_SIZE 12
+/* Size of the fixed part of a gzip header. */
+#define GZIP_STATIC_HEADER_SIZE 10
+/* Bit in the gzip flags byte announcing an extra field. */
+#define FLG_FEXTRA 0x04
+/* Offset of the flags byte inside the fixed gzip header. */
+#define OFF_FLG 3
+
+/* Starts a new WARC record. Writes the version header.
+   If opt.warc_maxsize is set and the current file is becoming
+   too large, this will open a new WARC file.
+
+   If compression is enabled, this will start a new
+   gzip stream in the current WARC file.
+
+   Returns false and sets warc_write_ok to false if there
+   is an error. */
+static bool
+warc_write_start_record (void)
+{
+  if (!warc_write_ok)
+    return false;
+
+  fflush (warc_current_file);
+  if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
+    {
+      /* Rotate to the next file. When that fails there is no usable
+         output stream left, so stop instead of writing to it. */
+      if (!warc_start_new_file (false))
+        {
+          warc_write_ok = false;
+          return false;
+        }
+    }
+
+  /* Start a GZIP stream, if required. */
+  if (opt.warc_compression_enabled)
+    {
+      int gz_fd;
+
+      /* Record the starting offset of the new record. */
+      warc_current_gzfile_offset = ftell (warc_current_file);
+
+      /* Reserve space for the extra GZIP header field.
+         In warc_write_end_record we will fill this space
+         with information about the uncompressed and
+         compressed size of the record. */
+      fprintf (warc_current_file, "XXXXXXXXXXXX");
+      fflush (warc_current_file);
+
+      /* Start a new GZIP stream on a duplicate descriptor: gzclose
+         closes the descriptor it was opened on, and the WARC file
+         itself has to stay open. */
+      gz_fd = dup (fileno (warc_current_file));
+      if (gz_fd >= 0)
+        warc_current_gzfile = gzdopen (gz_fd, "wb+9");
+      else
+        warc_current_gzfile = NULL;
+      warc_current_gzfile_uncompressed_size = 0;
+
+      if (warc_current_gzfile == NULL)
+        {
+          /* gzdopen does not take ownership on failure. */
+          if (gz_fd >= 0)
+            close (gz_fd);
+          logprintf (LOG_NOTQUIET, _("Error opening GZIP stream to WARC file.\n"));
+          warc_write_ok = false;
+          return false;
+        }
+    }
+
+  warc_write_string ("WARC/1.0\r\n");
+  return warc_write_ok;
+}
+
+/* Writes a WARC header to the current WARC record.
+   This method may be run after warc_write_start_record and
+   before warc_write_block_from_file.
+   Nothing is written when VALUE is NULL.
+   Returns the current value of warc_write_ok. */
+static bool
+warc_write_header (const char *name, const char *value)
+{
+  if (value != NULL)
+    {
+      warc_write_string (name);
+      warc_write_string (": ");
+      warc_write_string (value);
+      warc_write_string ("\r\n");
+    }
+  return warc_write_ok;
+}
+
+/* Copies the contents of DATA_IN to the WARC record.
+   Adds a Content-Length header to the WARC record.
+   Run this method after warc_write_header,
+   then run warc_write_end_record.
+   Returns false and sets warc_write_ok to false if there
+   is an error. */
+static bool
+warc_write_block_from_file (FILE *data_in)
+{
+  /* Add the Content-Length header. asprintf reports failure with a
+     negative result and then leaves the pointer unusable. */
+  char *content_length = NULL;
+  long data_size = -1;
+
+  if (fseek (data_in, 0L, SEEK_END) == 0)
+    data_size = ftell (data_in);
+  if (data_size < 0 || asprintf (&content_length, "%ld", data_size) < 0)
+    {
+      warc_write_ok = false;
+      return false;
+    }
+  warc_write_header ("Content-Length", content_length);
+  free (content_length);
+
+  /* End of the WARC header section. */
+  warc_write_string ("\r\n");
+
+  if (fseek (data_in, 0L, SEEK_SET) != 0)
+    warc_write_ok = false;
+
+  /* Copy the data in the file to the WARC record. */
+  char buffer[BUFSIZ];
+  size_t chunk;
+  while (warc_write_ok && (chunk = fread (buffer, 1, BUFSIZ, data_in)) > 0)
+    {
+      if (warc_write_buffer (buffer, chunk) < chunk)
+        warc_write_ok = false;
+    }
+
+  return warc_write_ok;
+}
+
+/* Run this method to close the current WARC record.
+
+   If compression is enabled, this method closes the
+   current GZIP stream and fills the extra GZIP header
+   with the uncompressed and compressed length of the
+   record.
*/
+static bool
+warc_write_end_record (void)
+{
+  /* Every record is terminated by two CRLF pairs. */
+  if (warc_write_ok && warc_write_buffer ("\r\n\r\n", 4) != 4)
+    warc_write_ok = false;
+
+  /* We start a new gzip stream for each record. */
+  if (warc_current_gzfile)
+    {
+      /* gzclose releases the handle whether or not it succeeds, so
+         forget it right away; this also closes the stream of a
+         record that failed half-way. */
+      int gz_status = gzclose (warc_current_gzfile);
+      warc_current_gzfile = NULL;
+      if (gz_status != Z_OK)
+        warc_write_ok = false;
+      if (!warc_write_ok)
+        return false;
+
+      /* The WARC standard suggests that we add 'skip length' data in the
+         extra header field of the GZIP stream.
+
+         In warc_write_start_record we reserved space for this extra header.
+         This extra space starts at warc_current_gzfile_offset and fills
+         EXTRA_GZIP_HEADER_SIZE bytes. The static GZIP header starts at
+         warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE.
+
+         We need to do three things:
+         1. Move the static GZIP header to warc_current_gzfile_offset;
+         2. Set the FEXTRA flag in the GZIP header;
+         3. Write the extra GZIP header after the static header, that is,
+            starting at warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE.
+      */
+
+      long end_offset = -1;
+      if (fflush (warc_current_file) == 0
+          && fseek (warc_current_file, 0, SEEK_END) == 0)
+        end_offset = ftell (warc_current_file);
+      if (end_offset < 0)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Calculate the two sizes stored in the extra header.
+         NOTE(review): uncompressed_size is computed as the distance
+         between the start of the record and the end of the file,
+         i.e. the on-disk length, while compressed_size is the number
+         of raw bytes passed to gzwrite. The names and the byte order
+         below are kept exactly as before; confirm the intended slot
+         order against the WARC 'sl' field definition. */
+      size_t current_offset = end_offset;
+      size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
+      size_t compressed_size = warc_current_gzfile_uncompressed_size;
+
+      /* Go back to the static GZIP header and read it. */
+      char static_header[GZIP_STATIC_HEADER_SIZE];
+      if (fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET) != 0
+          || fread (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file) != GZIP_STATIC_HEADER_SIZE)
+        {
+          warc_write_ok = false;
+          return false;
+        }
+
+      /* Set the FEXTRA flag in the flags byte of the header. */
+      static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
+
+      /* Prepare the extra GZIP header. All multi-byte values are
+         stored least significant byte first. */
+      unsigned char extra_header[EXTRA_GZIP_HEADER_SIZE];
+      /* XLEN, the length of the extra header fields. */
+      extra_header[0] = ((EXTRA_GZIP_HEADER_SIZE - 2) & 255);
+      extra_header[1] = ((EXTRA_GZIP_HEADER_SIZE - 2) >> 8) & 255;
+      /* The extra header field identifier for the WARC skip length. */
+      extra_header[2] = 's';
+      extra_header[3] = 'l';
+      /* First size slot (see the note above). */
+      extra_header[4] = (uncompressed_size & 255);
+      extra_header[5] = (uncompressed_size >> 8) & 255;
+      extra_header[6] = (uncompressed_size >> 16) & 255;
+      extra_header[7] = (uncompressed_size >> 24) & 255;
+      /* Second size slot (see the note above). */
+      extra_header[8] = (compressed_size & 255);
+      extra_header[9] = (compressed_size >> 8) & 255;
+      extra_header[10] = (compressed_size >> 16) & 255;
+      extra_header[11] = (compressed_size >> 24) & 255;
+
+      /* Write the static header at warc_current_gzfile_offset, the
+         extra header right after it, then move back to the end of
+         the file. */
+      if (fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET) != 0
+          || fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file) != GZIP_STATIC_HEADER_SIZE
+          || fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET) != 0
+          || fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file) != EXTRA_GZIP_HEADER_SIZE
+          || fflush (warc_current_file) != 0
+          || fseek (warc_current_file, 0, SEEK_END) != 0)
+        warc_write_ok = false;
+    }
+
+  return warc_write_ok;
+}
+
+
+/* Writes the WARC-Date header for the given timestamp to
+   the current WARC record.
+   If timestamp is NULL, the current time will be used.
+   Returns the current value of warc_write_ok. */
+static bool
+warc_write_date_header (char *timestamp)
+{
+  /* Declared at function scope: TIMESTAMP may point at this buffer
+     until the header has been written. */
+  char current_timestamp[21];
+
+  if (timestamp == NULL)
+    {
+      warc_timestamp (current_timestamp);
+      timestamp = current_timestamp;
+    }
+  return warc_write_header ("WARC-Date", timestamp);
+}
+
+/* Writes the WARC-IP-Address header for the given IP to
+   the current WARC record. If IP is NULL, no header will
+   be written.
*/
+static bool
+warc_write_ip_header (ip_address *ip)
+{
+  if (ip != NULL)
+    return warc_write_header ("WARC-IP-Address", print_address (ip));
+  else
+    /* Nothing to write; report the current writer state. */
+    return warc_write_ok;
+}
+
+
+/* warc_sha1_stream_with_payload is a modified copy of sha1_stream
+   from gnulib/sha1.c. This version calculates two digests in one go.
+
+   Compute SHA1 message digests for bytes read from STREAM, starting
+   at its current position. The digest of everything read will be
+   written to RES_BLOCK; callers in this file pass a buffer of
+   SHA1_DIGEST_SIZE bytes.
+
+   If payload_offset >= 0, a second digest will be calculated of the
+   portion of the file starting at payload_offset and continuing to
+   the end of the file. That digest will be written to RES_PAYLOAD
+   (again a SHA1_DIGEST_SIZE byte buffer in the callers shown here).
+   If payload_offset < 0, RES_PAYLOAD is not written.
+
+   Returns 0 on success and 1 if the read buffer cannot be allocated
+   or reading from STREAM fails. */
+static int
+warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
+{
+#define BLOCKSIZE 32768
+
+  struct sha1_ctx ctx_block;
+  struct sha1_ctx ctx_payload;
+  /* Number of bytes read from STREAM so far. */
+  long int pos;
+  /* Number of bytes currently held in BUFFER. */
+  size_t sum;
+
+  char *buffer = malloc (BLOCKSIZE + 72);
+  if (!buffer)
+    return 1;
+
+  /* Initialize the computation context. */
+  sha1_init_ctx (&ctx_block);
+  if (payload_offset >= 0)
+    sha1_init_ctx (&ctx_payload);
+
+  pos = 0;
+
+  /* Iterate over full file contents. */
+  while (1)
+    {
+      /* We read the file in blocks of BLOCKSIZE bytes. One call of the
+         computation function processes the whole buffer so that with the
+         next round of the loop another block can be read. */
+      size_t n;
+      sum = 0;
+
+      /* Read block. Take care for partial reads. */
+      while (1)
+        {
+          n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
+
+          sum += n;
+          pos += n;
+
+          if (sum == BLOCKSIZE)
+            break;
+
+          if (n == 0)
+            {
+              /* Check for the error flag IFF N == 0, so that we don't
+                 exit the loop after a partial read due to e.g., EAGAIN
+                 or EWOULDBLOCK. */
+              if (ferror (stream))
+                {
+                  free (buffer);
+                  return 1;
+                }
+              goto process_partial_block;
+            }
+
+          /* We've read at least one byte, so ignore errors. But always
+             check for EOF, since feof may be true even though N > 0.
+             Otherwise, we could end up calling fread after EOF. */
+          if (feof (stream))
+            goto process_partial_block;
+        }
+
+      /* Process buffer with BLOCKSIZE bytes. Note that
+         BLOCKSIZE % 64 == 0
+       */
+      sha1_process_block (buffer, BLOCKSIZE, &ctx_block);
+      if (payload_offset >= 0 && payload_offset < pos)
+        {
+          /* At least part of the buffer contains data from payload.
+             (pos - BLOCKSIZE) is the stream offset of buffer[0]. */
+          int start_of_payload = payload_offset - (pos - BLOCKSIZE);
+          if (start_of_payload <= 0)
+            /* All bytes in the buffer belong to the payload. */
+            start_of_payload = 0;
+
+          /* Process the payload part of the buffer.
+             Note: we can't use sha1_process_block here even if we
+             process the complete buffer. Because the payload doesn't
+             have to start with a full block, there may still be some
+             bytes left from the previous buffer. Therefore, we need
+             to continue with sha1_process_bytes. */
+          sha1_process_bytes (buffer + start_of_payload, BLOCKSIZE - start_of_payload, &ctx_payload);
+        }
+    }
+
+ process_partial_block:;
+
+  /* Process any remaining bytes. */
+  if (sum > 0)
+    {
+      sha1_process_bytes (buffer, sum, &ctx_block);
+      if (payload_offset >= 0 && payload_offset < pos)
+        {
+          /* At least part of the buffer contains data from payload.
+             (pos - sum) is the stream offset of buffer[0]. */
+          int start_of_payload = payload_offset - (pos - sum);
+          if (start_of_payload <= 0)
+            /* All bytes in the buffer belong to the payload. */
+            start_of_payload = 0;
+
+          /* Process the payload part of the buffer. */
+          sha1_process_bytes (buffer + start_of_payload, sum - start_of_payload, &ctx_payload);
+        }
+    }
+
+  /* Construct result in desired memory. */
+  sha1_finish_ctx (&ctx_block, res_block);
+  if (payload_offset >= 0)
+    sha1_finish_ctx (&ctx_payload, res_payload);
+  free (buffer);
+  return 0;
+
+#undef BLOCKSIZE
+}
+
+/* Converts the SHA1 digest to a base32-encoded string.
+   "sha1:DIGEST\0" (Allocates a new string for the response.)
*/
+static char *
+warc_base32_sha1_digest (char *sha1_digest)
+{
+  /* Layout: "sha1:" + base32 text + terminating NUL.
+     Returns NULL when the string cannot be allocated; the caller
+     frees the result. */
+  const size_t prefix_length = 5;
+  const size_t encoded_length = BASE32_LENGTH (SHA1_DIGEST_SIZE);
+  char *sha1_base32 = malloc (prefix_length + encoded_length + 1);
+  if (sha1_base32 == NULL)
+    return NULL;
+
+  memcpy (sha1_base32, "sha1:", prefix_length);
+  base32_encode (sha1_digest, SHA1_DIGEST_SIZE, sha1_base32 + prefix_length, encoded_length + 1);
+  sha1_base32[prefix_length + encoded_length] = '\0';
+  return sha1_base32;
+}
+
+
+/* Sets the digest headers of the record.
+   This method will calculate the block digest and, if payload_offset >= 0,
+   will also calculate the payload digest of the payload starting at the
+   provided offset.
+   FILE is rewound before it is read. Nothing is written when digests
+   are disabled or when the digests cannot be computed. */
+static void
+warc_write_digest_headers (FILE *file, long payload_offset)
+{
+  /* Raw digests of the whole block and of the payload part. */
+  char sha1_res_block[SHA1_DIGEST_SIZE];
+  char sha1_res_payload[SHA1_DIGEST_SIZE];
+  char *digest;
+
+  if (!opt.warc_digests_enabled)
+    return;
+
+  rewind (file);
+  if (warc_sha1_stream_with_payload (file, sha1_res_block, sha1_res_payload, payload_offset) != 0)
+    return;
+
+  /* A NULL digest (allocation failure) is skipped by
+     warc_write_header and is harmless to free. */
+  digest = warc_base32_sha1_digest (sha1_res_block);
+  warc_write_header ("WARC-Block-Digest", digest);
+  free (digest);
+
+  if (payload_offset >= 0)
+    {
+      digest = warc_base32_sha1_digest (sha1_res_payload);
+      warc_write_header ("WARC-Payload-Digest", digest);
+      free (digest);
+    }
+}
+
+
+/* Fills timestamp with the current time and date.
+   The UTC time is formatted following ISO 8601, as required
+   for use in the WARC-Date header.
+   TIMESTAMP must have room for 21 bytes: the 20 characters of
+   "YYYY-MM-DDTHH:MM:SSZ" plus the terminating NUL.
+   If the time cannot be converted, TIMESTAMP is set to the
+   empty string. */
+void
+warc_timestamp (char *timestamp)
+{
+  time_t now = time (NULL);
+  struct tm *utc = gmtime (&now);
+
+  if (utc == NULL
+      || strftime (timestamp, 21, "%Y-%m-%dT%H:%M:%SZ", utc) == 0)
+    timestamp[0] = '\0';
+}
+
+/* Fills uuid_str with a UUID based on random numbers.
+   (See RFC 4122, UUID version 4.)
+
+   Note: this is a fallback method, it is much better to use the
+   methods provided by libuuid.
+
+   The uuid_str will be 36 characters long.
*/
+static void
+warc_uuid_random (char *uuid_str)
+{
+  /* RFC 4122, a version 4 UUID with only random numbers.
+     UUID_STR must have room for the 36 characters plus a NUL. */
+  unsigned char uuid_data[16];
+  int i;
+
+  /* NOTE(review): if random_number (N) only returns values below N,
+     the byte value 255 is never produced here -- confirm against
+     random_number's contract. */
+  for (i = 0; i < 16; i++)
+    uuid_data[i] = random_number (255);
+
+  /* Set the four most significant bits (bits 12 through 15) of the
+     time_hi_and_version field to the 4-bit version number. */
+  uuid_data[6] = (uuid_data[6] & 0x0F) | 0x40;
+
+  /* Set the two most significant bits (bits 6 and 7) of the
+     clock_seq_hi_and_reserved to zero and one, respectively. */
+  uuid_data[8] = (uuid_data[8] & 0xBF) | 0x80;
+
+  sprintf (uuid_str,
+    "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+    uuid_data[0], uuid_data[1], uuid_data[2], uuid_data[3], uuid_data[4],
+    uuid_data[5], uuid_data[6], uuid_data[7], uuid_data[8], uuid_data[9],
+    uuid_data[10], uuid_data[11], uuid_data[12], uuid_data[13], uuid_data[14],
+    uuid_data[15]);
+}
+
+/* Fills urn_str with a UUID in the format required
+   for the WARC-Record-Id header.
+   The string will be 47 characters long ("<urn:uuid:", the 36
+   character UUID and ">"), so URN_STR needs room for 48 bytes. */
+void
+warc_uuid_str (char *urn_str)
+{
+  char uuid_str[37];
+
+# ifdef HAVE_LIBUUID
+  uuid_t record_id;
+  uuid_generate (record_id);
+  uuid_unparse (record_id, uuid_str);
+# else
+  warc_uuid_random (uuid_str);
+# endif
+
+  sprintf (urn_str, "<urn:uuid:%s>", uuid_str);
+}
+
+/* Write a warcinfo record to the current file.
+   Updates warc_current_warcinfo_uuid_str.
+   FILENAME is the name of the WARC file; only its basename is
+   stored in the record.
+   Returns true if the record was written, false otherwise. */
+bool
+warc_write_warcinfo_record (char *filename)
+{
+  /* Write warc-info record as the first record of the file. */
+  /* We add the record id of this info record to the other records in the file. */
+  warc_current_warcinfo_uuid_str = malloc (48);
+  char *filename_copy = strdup (filename);
+  if (warc_current_warcinfo_uuid_str == NULL || filename_copy == NULL)
+    {
+      free (filename_copy);
+      warc_write_ok = false;
+      return false;
+    }
+  warc_uuid_str (warc_current_warcinfo_uuid_str);
+
+  /* Create the temporary file for the content before the record is
+     opened, so that a failure here does not leave a half-written
+     record behind. */
+  FILE *warc_tmp = warc_tempfile ();
+  if (warc_tmp == NULL)
+    {
+      free (filename_copy);
+      return false;
+    }
+
+  char timestamp[22];
+  warc_timestamp (timestamp);
+
+  /* basename may modify its argument, hence the copy. */
+  char *filename_basename = basename (filename_copy);
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "warcinfo");
+  warc_write_header ("Content-Type", "application/warc-fields");
+  warc_write_header ("WARC-Date", timestamp);
+  warc_write_header ("WARC-Record-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Filename", filename_basename);
+
+  /* Create content. */
+  fprintf (warc_tmp, "software: Wget/%s (%s)\r\n", version_string, OS_TYPE);
+  fprintf (warc_tmp, "format: WARC File Format 1.0\r\n");
+  fprintf (warc_tmp, "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n");
+  fprintf (warc_tmp, "robots: %s\r\n", (opt.use_robots ? "classic" : "off"));
+  fprintf (warc_tmp, "wget-arguments: %s\r\n", program_argstring);
+  /* Add the user headers, if any. */
+  if (opt.warc_user_headers)
+    {
+      int i;
+      for (i = 0; opt.warc_user_headers[i]; i++)
+        fprintf (warc_tmp, "%s\r\n", opt.warc_user_headers[i]);
+    }
+  fprintf(warc_tmp, "\r\n");
+
+  warc_write_digest_headers (warc_tmp, -1);
+  warc_write_block_from_file (warc_tmp);
+  warc_write_end_record ();
+
+  if (! warc_write_ok)
+    {
+      logprintf (LOG_NOTQUIET, _("Error writing warcinfo record to WARC file.\n"));
+    }
+
+  free (filename_copy);
+  fclose (warc_tmp);
+  return warc_write_ok;
+}
+
+/* Opens a new WARC file.
+   If META is true, generates a filename ending with 'meta.warc.gz'.
+
+   This method will:
+   1. close the current WARC file (if there is one);
+   2. increment warc_current_file_number;
+   3. open a new WARC file;
+   4. write the initial warcinfo record.
+
+   Returns true on success, false otherwise.
+ */
+static bool
+warc_start_new_file (bool meta)
+{
+  if (opt.warc_filename == NULL)
+    return false;
+
+  /* Release the state of the previous file. The pointers are reset
+     so that a failure further down cannot lead to a second fclose
+     or free on the next call. */
+  if (warc_current_file != NULL)
+    fclose (warc_current_file);
+  warc_current_file = NULL;
+  free (warc_current_warcinfo_uuid_str);
+  warc_current_warcinfo_uuid_str = NULL;
+  free (warc_current_filename);
+  warc_current_filename = NULL;
+
+  warc_current_file_number++;
+
+  /* filename format: base + "-" + serial number + ".warc.gz".
+     The serial number is printed with at least 5 digits; 32 extra
+     bytes cover the longest possible int as well as the separators,
+     the extension and the terminating NUL. */
+  size_t new_filename_size = strlen (opt.warc_filename) + 32;
+  char *new_filename = malloc (new_filename_size);
+  if (new_filename == NULL)
+    return false;
+  warc_current_filename = new_filename;
+
+  char *extension = (opt.warc_compression_enabled ? "warc.gz" : "warc");
+
+  /* If max size is enabled, we add a serial number to the file names. */
+  if (meta)
+    snprintf (new_filename, new_filename_size, "%s-meta.%s", opt.warc_filename, extension);
+  else if (opt.warc_maxsize > 0)
+    snprintf (new_filename, new_filename_size, "%s-%05d.%s", opt.warc_filename, warc_current_file_number, extension);
+  else
+    snprintf (new_filename, new_filename_size, "%s.%s", opt.warc_filename, extension);
+
+  logprintf (LOG_VERBOSE, _("Opening WARC file %s.\n\n"), quote (new_filename));
+
+  /* Open the WARC file. */
+  warc_current_file = fopen (new_filename, "wb+");
+  if (warc_current_file == NULL)
+    {
+      logprintf (LOG_NOTQUIET, _("Error opening WARC file %s.\n"), quote (new_filename));
+      return false;
+    }
+
+  if (! warc_write_warcinfo_record (new_filename))
+    return false;
+
+  /* Add warcinfo uuid to manifest. */
+  if (warc_manifest_fp)
+    fprintf (warc_manifest_fp, "%s\n", warc_current_warcinfo_uuid_str);
+
+  return true;
+}
+
+/* Opens the CDX file for output.
+   The file is named opt.warc_filename + ".cdx" and is opened in
+   append mode; a CDX header line is written on every call.
+   Returns false if the file cannot be opened. */
+static bool
+warc_start_cdx_file (void)
+{
+  int filename_length = strlen (opt.warc_filename);
+  char *cdx_filename = alloca (filename_length + 4 + 1);
+  memcpy (cdx_filename, opt.warc_filename, filename_length);
+  /* Copies ".cdx" including its terminating NUL. */
+  memcpy (cdx_filename + filename_length, ".cdx", 5);
+  warc_current_cdx_file = fopen (cdx_filename, "a+");
+  if (warc_current_cdx_file == NULL)
+    return false;
+
+  /* Print the CDX header.
+   *
+   * a - original url
+   * b - date
+   * m - mime type
+   * s - response code
+   * k - new style checksum
+   * r - redirect
+   * M - meta tags
+   * V - compressed arc file offset
+   * g - file name
+   * u - record-id
+   *
+   * NOTE(review): the header line below lists 'a' twice (first and
+   * third column) -- confirm that this is intended.
+   */
+  fprintf (warc_current_cdx_file, " CDX a b a m s k r M V g u\n");
+  fflush (warc_current_cdx_file);
+
+  return true;
+}
+
+#define CDX_FIELDSEP " \t\r\n"
+
+/* Parse the CDX header and find the field numbers of the original url,
+   checksum and record ID fields.
+   LINEPTR is modified by the tokenizer. Field numbers count from 0,
+   starting at the first token after "CDX"; a field that is not
+   listed is reported as -1. When a letter occurs more than once the
+   last occurrence wins.
+   Returns true if all three fields were found. */
+static bool
+warc_parse_cdx_header (char *lineptr, int *field_num_original_url, int *field_num_checksum, int *field_num_record_id)
+{
+  *field_num_original_url = -1;
+  *field_num_checksum = -1;
+  *field_num_record_id = -1;
+
+  char *save_ptr;
+  char *token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
+
+  if (token != NULL && strcmp (token, "CDX") == 0)
+    {
+      int field_num;
+      for (field_num = 0;
+           (token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr)) != NULL;
+           field_num++)
+        {
+          switch (token[0])
+            {
+            case 'a':
+              *field_num_original_url = field_num;
+              break;
+            case 'k':
+              *field_num_checksum = field_num;
+              break;
+            case 'u':
+              *field_num_record_id = field_num;
+              break;
+            default:
+              /* A field we do not need. */
+              break;
+            }
+        }
+    }
+
+  return *field_num_original_url != -1
+         && *field_num_checksum != -1
+         && *field_num_record_id != -1;
+}
+
+/* Parse the CDX record and add it to the warc_cdx_dedup_table hash table.
+   LINEPTR is modified by the tokenizer. Lines that lack one of the
+   three fields, or whose checksum does not decode to exactly
+   SHA1_DIGEST_SIZE bytes, are ignored. */
+static void
+warc_process_cdx_line (char *lineptr, int field_num_original_url, int field_num_checksum, int field_num_record_id)
+{
+  char *original_url = NULL;
+  char *checksum = NULL;
+  char *record_id = NULL;
+
+  char *save_ptr;
+  char *token;
+  int field_num = 0;
+
+  /* Read this line to get the fields we need. */
+  for (token = strtok_r (lineptr, CDX_FIELDSEP, &save_ptr);
+       token != NULL;
+       token = strtok_r (NULL, CDX_FIELDSEP, &save_ptr), field_num++)
+    {
+      char **val = NULL;
+      if (field_num == field_num_original_url)
+        val = &original_url;
+      else if (field_num == field_num_checksum)
+        val = &checksum;
+      else if (field_num == field_num_record_id)
+        val = &record_id;
+
+      if (val != NULL)
+        *val = strdup (token);
+    }
+
+  if (original_url != NULL && checksum != NULL && record_id != NULL)
+    {
+      /* For some extra efficiency, we decode the base32 encoded
+         checksum value. This should produce exactly SHA1_DIGEST_SIZE
+         bytes. */
+      size_t checksum_l = 0;
+      char *checksum_v = NULL;
+      base32_decode_alloc (checksum, strlen (checksum), &checksum_v, &checksum_l);
+
+      if (checksum_v != NULL && checksum_l == SHA1_DIGEST_SIZE)
+        {
+          /* This is a valid line with a valid checksum. */
+          struct warc_cdx_record *rec = malloc (sizeof *rec);
+          if (rec != NULL)
+            {
+              rec->url = original_url;
+              rec->uuid = record_id;
+              memcpy (rec->digest, checksum_v, SHA1_DIGEST_SIZE);
+              hash_table_put (warc_cdx_dedup_table, rec->digest, rec);
+              /* The record now owns these two strings. */
+              original_url = NULL;
+              record_id = NULL;
+            }
+        }
+      free (checksum_v);
+    }
+
+  /* Release whatever was not handed over to the table; this also
+     covers lines on which only some of the fields were present. */
+  free (original_url);
+  free (checksum);
+  free (record_id);
+}
+
+/* Loads the CDX file from opt.warc_cdx_dedup_filename and fills
+   the warc_cdx_dedup_table.
+   Returns false if the file cannot be opened and true otherwise,
+   including when the header lacks one of the required columns (in
+   which case a message is logged and no table is created). */
+bool
+warc_load_cdx_dedup_file ()
+{
+  FILE *f = fopen (opt.warc_cdx_dedup_filename, "r");
+  if (f == NULL)
+    return false;
+
+  int field_num_original_url = -1;
+  int field_num_checksum = -1;
+  int field_num_record_id = -1;
+
+  /* getline allocates and grows LINEPTR; it reports end of file or
+     an error with -1, so the result needs a signed type. */
+  char *lineptr = NULL;
+  size_t n = 0;
+  ssize_t line_length;
+
+  /* The first line should contain the CDX header.
+     Format: " CDX x x x x x"
+     where x are field type indicators. For our purposes, we only
+     need 'a' (the original url), 'k' (the SHA1 checksum) and
+     'u' (the WARC record id). */
+  line_length = getline (&lineptr, &n, f);
+  if (line_length != -1)
+    warc_parse_cdx_header (lineptr, &field_num_original_url, &field_num_checksum, &field_num_record_id);
+
+  /* If the file contains all three fields, read the complete file. */
+  if (field_num_original_url == -1
+      || field_num_checksum == -1
+      || field_num_record_id == -1)
+    {
+      if (field_num_original_url == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list original urls. (Missing column 'a'.)\n"));
+      if (field_num_checksum == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list checksums. (Missing column 'k'.)\n"));
+      if (field_num_record_id == -1)
+        logprintf (LOG_NOTQUIET, _("CDX file does not list record ids. (Missing column 'u'.)\n"));
+    }
+  else
+    {
+      /* Initialize the table. */
+      warc_cdx_dedup_table = hash_table_new (1000, warc_hash_sha1_digest, warc_cmp_sha1_digest);
+
+      while ((line_length = getline (&lineptr, &n, f)) != -1)
+        warc_process_cdx_line (lineptr, field_num_original_url, field_num_checksum, field_num_record_id);
+
+      /* Print results. */
+      int nrecords = hash_table_count (warc_cdx_dedup_table);
+      logprintf (LOG_VERBOSE, ngettext ("Loaded %d record from CDX.\n\n",
+                                        "Loaded %d records from CDX.\n\n", nrecords),
+                 nrecords);
+    }
+
+  free (lineptr);
+  fclose (f);
+
+  return true;
+}
+#undef CDX_FIELDSEP
+
+/* Returns the existing duplicate CDX record for the given url and payload
+   digest. Returns NULL if the url is not found or if the payload digest
+   does not match, or if CDX deduplication is disabled.
*/
+static struct warc_cdx_record *
+warc_find_duplicate_cdx_record (char *url, char *sha1_digest_payload)
+{
+  if (warc_cdx_dedup_table == NULL)
+    return NULL;
+
+  /* Both outputs start out as NULL so that a failed lookup is seen as
+     "no record" instead of reading an uninitialized pointer. */
+  char *key = NULL;
+  struct warc_cdx_record *rec_existing = NULL;
+  hash_table_get_pair (warc_cdx_dedup_table, sha1_digest_payload, &key, &rec_existing);
+
+  /* The digest alone is not enough: the url has to match as well. */
+  if (rec_existing != NULL && strcmp (rec_existing->url, url) == 0)
+    return rec_existing;
+
+  return NULL;
+}
+
+/* Initializes the WARC writer (if opt.warc_filename is set).
+   This should be called before any WARC record is written.
+   Loads the CDX deduplication table (if requested), opens the temporary
+   manifest and log files, starts the first WARC file and opens the CDX
+   output file (if enabled).  Any failure terminates the program with
+   exit status 1. */
+void
+warc_init ()
+{
+  warc_write_ok = true;
+
+  if (opt.warc_filename == NULL)
+    return;
+
+  if (opt.warc_cdx_dedup_filename != NULL)
+    {
+      if (! warc_load_cdx_dedup_file ())
+        {
+          logprintf (LOG_NOTQUIET,
+                     _("Could not read CDX file %s for deduplication.\n"),
+                     quote (opt.warc_cdx_dedup_filename));
+          exit (1);
+        }
+    }
+
+  warc_manifest_fp = warc_tempfile ();
+  if (warc_manifest_fp == NULL)
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open temporary WARC manifest file.\n"));
+      exit (1);
+    }
+
+  if (opt.warc_keep_log)
+    {
+      warc_log_fp = warc_tempfile ();
+      if (warc_log_fp == NULL)
+        {
+          logprintf (LOG_NOTQUIET, _("Could not open temporary WARC log file.\n"));
+          exit (1);
+        }
+      log_set_warc_log_fp (warc_log_fp);
+    }
+
+  /* warc_start_new_file increments the counter before using it, so the
+     first file gets serial number 0. */
+  warc_current_file_number = -1;
+  if (! warc_start_new_file (false))
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
+      exit (1);
+    }
+
+  if (opt.warc_cdx_enabled)
+    {
+      if (! warc_start_cdx_file ())
+        {
+          logprintf (LOG_NOTQUIET, _("Could not open CDX file for output.\n"));
+          exit (1);
+        }
+    }
+}
+
+/* Writes metadata (manifest, configuration, log file) to the WARC file.
+   The manifest, the program arguments and (if kept) the log are each
+   stored as a resource record; the arguments and log records refer to
+   the manifest record through WARC-Concurrent-To. */
+void
+warc_write_metadata ()
+{
+  /* If there are multiple WARC files, the metadata should be written to a separate file. */
+  if (opt.warc_maxsize > 0 && ! warc_start_new_file (true))
+    {
+      /* There is no usable WARC file to write the records to. */
+      logprintf (LOG_NOTQUIET, _("Could not open WARC file.\n"));
+      if (warc_manifest_fp != NULL)
+        {
+          fclose (warc_manifest_fp);
+          warc_manifest_fp = NULL;
+        }
+      return;
+    }
+
+  char manifest_uuid [48];
+  warc_uuid_str (manifest_uuid);
+
+  fflush (warc_manifest_fp);
+  warc_write_resource_record (manifest_uuid,
+                              "metadata://gnu.org/software/wget/warc/MANIFEST.txt",
+                              NULL, NULL, NULL, "text/plain",
+                              warc_manifest_fp, -1);
+  /* warc_write_resource_record has closed warc_manifest_fp. */
+  warc_manifest_fp = NULL;
+
+  FILE * warc_tmp_fp = warc_tempfile ();
+  if (warc_tmp_fp == NULL)
+    {
+      logprintf (LOG_NOTQUIET, _("Could not open temporary WARC file.\n"));
+      exit (1);
+    }
+  /* Write first, then flush, so the data is in the file before the
+     record is built from it. */
+  fprintf (warc_tmp_fp, "%s\n", program_argstring);
+  fflush (warc_tmp_fp);
+
+  /* Like the log record below, this record gets a record id of its own
+     (NULL) and names the manifest as its concurrent record.  Reusing
+     manifest_uuid as the record id would produce two records with the
+     same WARC-Record-ID. */
+  warc_write_resource_record (NULL,
+                              "metadata://gnu.org/software/wget/warc/wget_arguments.txt",
+                              NULL, manifest_uuid, NULL, "text/plain",
+                              warc_tmp_fp, -1);
+  /* warc_write_resource_record has closed warc_tmp_fp. */
+
+  if (warc_log_fp != NULL)
+    {
+      warc_write_resource_record (NULL,
+                                  "metadata://gnu.org/software/wget/warc/wget.log",
+                                  NULL, manifest_uuid, NULL, "text/plain",
+                                  warc_log_fp, -1);
+      /* warc_write_resource_record has closed warc_log_fp. */
+
+      warc_log_fp = NULL;
+      log_set_warc_log_fp (NULL);
+    }
+}
+
+/* Finishes the WARC writing.
+   This should be called at the end of the program.
+   Writes the metadata records and closes the WARC file, the CDX file
+   and the temporary log file, whichever of them are open. */
+void
+warc_close ()
+{
+  if (warc_current_file != NULL)
+    {
+      warc_write_metadata ();
+
+      /* warc_write_metadata may have switched to a new WARC file.  If
+         opening that file failed, warc_current_file is NULL and the
+         warcinfo uuid string has already been freed by
+         warc_start_new_file, so neither may be released again here. */
+      if (warc_current_file != NULL)
+        {
+          free (warc_current_warcinfo_uuid_str);
+          warc_current_warcinfo_uuid_str = NULL;
+          fclose (warc_current_file);
+          warc_current_file = NULL;
+        }
+    }
+  if (warc_current_cdx_file != NULL)
+    {
+      fclose (warc_current_cdx_file);
+      warc_current_cdx_file = NULL;
+    }
+  if (warc_log_fp != NULL)
+    {
+      fclose (warc_log_fp);
+      warc_log_fp = NULL;
+      log_set_warc_log_fp (NULL);
+    }
+}
+
+/* Creates a temporary file for writing WARC output.
+   The temporary file will be created in opt.warc_tempdir.
+   Returns the pointer to the temporary file, or NULL.
*/
+FILE *
+warc_tempfile ()
+{
+  char filename[100];
+  if (path_search (filename, sizeof filename, opt.warc_tempdir, "wget", true) == -1)
+    return NULL;
+
+  int fd = mkstemp (filename);
+  if (fd < 0)
+    return NULL;
+
+  /* The name is removed right away; the file stays usable through the
+     open descriptor.  On failure the descriptor is closed so that it
+     does not leak. */
+  if (unlink (filename) < 0)
+    {
+      close (fd);
+      return NULL;
+    }
+
+  FILE *fp = fdopen (fd, "wb+");
+  if (fp == NULL)
+    close (fd);
+
+  return fp;
+}
+
+
+/* Writes a request record to the WARC file.
+   url  is the target uri of the request,
+   timestamp_str  is the timestamp of the request (generated with warc_timestamp),
+   record_uuid  is the uuid of the request (generated with warc_uuid_str),
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the request headers and body,
+   payload_offset  is handed to warc_write_digest_headers together with body.
+   Calling this function will close body.
+   Returns true on success, false on error. */
+bool
+warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
+{
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "request");
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_header ("Content-Type", "application/http;msgtype=request");
+  warc_write_date_header (timestamp_str);
+  warc_write_header ("WARC-Record-ID", record_uuid);
+  warc_write_ip_header (ip);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_digest_headers (body, payload_offset);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+
+  /* The result is the writer's global status flag. */
+  return warc_write_ok;
+}
+
+/* Writes a response record to the CDX file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response
+                  (generated with warc_timestamp),
+   mime_type  is the mime type of the response body (will be printed to CDX),
+   response_code  is the HTTP response code (will be printed to CDX),
+   payload_digest  is the sha1 digest of the payload (or NULL),
+   redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX),
+   offset  is the position of the WARC record in the WARC file,
+   warc_filename  is the filename of the WARC,
+   response_uuid  is the uuid of the response.
+   Missing or empty values are printed as "-".
+   Returns true on success, false on error. */
+static bool
+warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
+{
+  /* Transform the timestamp: copy the digit groups and skip the
+     separator that follows each of them. */
+  char timestamp_str_cdx [15];
+  memcpy (timestamp_str_cdx     , timestamp_str     , 4); /* "YYYY" "-" */
+  memcpy (timestamp_str_cdx +  4, timestamp_str +  5, 2); /* "mm"   "-" */
+  memcpy (timestamp_str_cdx +  6, timestamp_str +  8, 2); /* "dd"   "T" */
+  memcpy (timestamp_str_cdx +  8, timestamp_str + 11, 2); /* "HH"   ":" */
+  memcpy (timestamp_str_cdx + 10, timestamp_str + 14, 2); /* "MM"   ":" */
+  memcpy (timestamp_str_cdx + 12, timestamp_str + 17, 2); /* "SS"   "Z" */
+  timestamp_str_cdx[14] = '\0';
+
+  /* Rewrite the checksum. */
+  const char *checksum;
+  if (payload_digest != NULL)
+    checksum = payload_digest + 5; /* Skip the "sha1:" */
+  else
+    checksum = "-";
+
+  if (mime_type == NULL || strlen (mime_type) == 0)
+    mime_type = "-";
+  if (redirect_location == NULL || strlen (redirect_location) == 0)
+    redirect_location = "-";
+
+  /* Print the CDX line.  The offset is a size_t, so it is converted to
+     match the %ld conversion. */
+  fprintf (warc_current_cdx_file, "%s %s %s %s %d %s %s - %ld %s %s\n", url, timestamp_str_cdx, url, mime_type, response_code, checksum, redirect_location, (long) offset, warc_filename, response_uuid);
+  fflush (warc_current_cdx_file);
+
+  return true;
+}
+
+/* Writes a revisit record to the WARC file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response
+                  (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this response
+                  (generated with warc_uuid_str),
+   payload_digest  is the sha1 digest of the payload,
+   refers_to  is the uuid of the original response,
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the response headers (without payload).
+   Calling this function will close body.
+   Returns true on success, false on error. */
+static bool
+warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_uuid, char *payload_digest, char *refers_to, ip_address *ip, FILE *body)
+{
+  char revisit_uuid [48];
+  warc_uuid_str (revisit_uuid);
+
+  /* The block digest covers the whole body, so hashing has to start at
+     the beginning of the file.  NOTE(review): the caller has already
+     read and truncated body, so the stream position is presumably not
+     at the start when we get here. */
+  char *block_digest = NULL;
+  char sha1_res_block[SHA1_DIGEST_SIZE];
+  rewind (body);
+  /* On failure block_digest stays NULL, like the digests that
+     warc_write_response_record passes on when digests are disabled. */
+  if (sha1_stream (body, sha1_res_block) == 0)
+    block_digest = warc_base32_sha1_digest (sha1_res_block);
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "revisit");
+  warc_write_header ("WARC-Record-ID", revisit_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Refers-To", refers_to);
+  warc_write_header ("WARC-Profile", "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest");
+  warc_write_header ("WARC-Truncated", "length");
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_header ("Content-Type", "application/http;msgtype=response");
+  warc_write_header ("WARC-Block-Digest", block_digest);
+  warc_write_header ("WARC-Payload-Digest", payload_digest);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+  free (block_digest);
+
+  return warc_write_ok;
+}
+
+/* Writes a response record to the WARC file.
+   url  is the target uri of the request/response,
+   timestamp_str  is the timestamp of the request that generated this response
+                  (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this response
+                  (generated with warc_uuid_str),
+   ip  is the ip address of the server (or NULL),
+   body  is a pointer to a file containing the response headers and body,
+   payload_offset  is the offset in body at which the payload starts,
+   mime_type  is the mime type of the response body (will be printed to CDX),
+   response_code  is the HTTP response code (will be printed to CDX),
+   redirect_location  is the contents of the Location: header, or NULL (will be printed to CDX).
+   If digests are enabled and the url and payload digest match a record
+   of the CDX deduplication table, the payload is cut off and a revisit
+   record is written instead.
+   Calling this function will close body.
+   Returns true on success, false on error. */
+bool
+warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
+{
+  char *block_digest = NULL;
+  char *payload_digest = NULL;
+  char sha1_res_block[SHA1_DIGEST_SIZE];
+  char sha1_res_payload[SHA1_DIGEST_SIZE];
+
+  if (opt.warc_digests_enabled)
+    {
+      /* Calculate the block and payload digests. */
+      rewind (body);
+      if (warc_sha1_stream_with_payload (body, sha1_res_block, sha1_res_payload, payload_offset) == 0)
+        {
+          /* Decide (based on url + payload digest) if we have seen this
+             data before. */
+          struct warc_cdx_record *rec_existing = warc_find_duplicate_cdx_record (url, sha1_res_payload);
+          if (rec_existing != NULL)
+            {
+              /* Found an existing record. */
+              logprintf (LOG_VERBOSE, _("Found exact match in CDX file. Saving revisit record to WARC.\n"));
+
+              /* Remove the payload from the file. */
+              if (payload_offset > 0)
+                {
+                  if (ftruncate (fileno (body), payload_offset) == -1)
+                    {
+                      /* body is closed on every path, as documented. */
+                      fclose (body);
+                      return false;
+                    }
+                }
+
+              /* Send the original payload digest. */
+              payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+              bool result = warc_write_revisit_record (url, timestamp_str, concurrent_to_uuid, payload_digest, rec_existing->uuid, ip, body);
+              free (payload_digest);
+
+              return result;
+            }
+
+          block_digest = warc_base32_sha1_digest (sha1_res_block);
+          payload_digest = warc_base32_sha1_digest (sha1_res_payload);
+        }
+    }
+
+  /* Not a revisit, just store the record. */
+
+  char response_uuid [48];
+  warc_uuid_str (response_uuid);
+
+  /* Remember where this record starts; the CDX line needs the offset. */
+  fseek (warc_current_file, 0L, SEEK_END);
+  size_t offset = ftell (warc_current_file);
+
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "response");
+  warc_write_header ("WARC-Record-ID", response_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_header ("WARC-Block-Digest", block_digest);
+  warc_write_header ("WARC-Payload-Digest", payload_digest);
+  warc_write_header ("Content-Type", "application/http;msgtype=response");
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+
+  if (warc_write_ok && opt.warc_cdx_enabled)
+    {
+      /* Add this record to the CDX. */
+      warc_write_cdx_record (url, timestamp_str, mime_type, response_code, payload_digest, redirect_location, offset, warc_current_filename, response_uuid);
+    }
+
+  free (block_digest);
+  free (payload_digest);
+
+  return warc_write_ok;
+}
+
+/* Writes a resource record to the WARC file.
+   resource_uuid  is the uuid of the resource (or NULL, in which case
+                 a new uuid is generated with warc_uuid_str),
+   url  is the target uri of the resource,
+   timestamp_str  is the timestamp (generated with warc_timestamp),
+   concurrent_to_uuid  is the uuid of the request that generated this resource
+                 (generated with warc_uuid_str) or NULL,
+   ip  is the ip address of the server (or NULL),
+   content_type  is the mime type of the body (or NULL, which selects
+                 "application/octet-stream"),
+   body  is a pointer to a file containing the resource data,
+   payload_offset  is handed to warc_write_digest_headers together with body.
+   Calling this function will close body.
+   Returns true on success, false on error. */
+bool
+warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
+{
+  /* Holds the generated record id when the caller did not pass one. */
+  char generated_uuid [48];
+
+  if (resource_uuid == NULL)
+    {
+      warc_uuid_str (generated_uuid);
+      resource_uuid = generated_uuid;
+    }
+
+  if (content_type == NULL)
+    content_type = "application/octet-stream";
+
+  /* Record headers first, then the block copied from body. */
+  warc_write_start_record ();
+  warc_write_header ("WARC-Type", "resource");
+  warc_write_header ("WARC-Record-ID", resource_uuid);
+  warc_write_header ("WARC-Warcinfo-ID", warc_current_warcinfo_uuid_str);
+  warc_write_header ("WARC-Concurrent-To", concurrent_to_uuid);
+  warc_write_header ("WARC-Target-URI", url);
+  warc_write_date_header (timestamp_str);
+  warc_write_ip_header (ip);
+  warc_write_digest_headers (body, payload_offset);
+  warc_write_header ("Content-Type", content_type);
+  warc_write_block_from_file (body);
+  warc_write_end_record ();
+
+  fclose (body);
+
+  /* The result is the writer's global status flag. */
+  return warc_write_ok;
+}
+
diff --git a/src/warc.h b/src/warc.h
new file mode 100644
index 00000000..2ade2a8b
--- /dev/null
+++ b/src/warc.h
@@ -0,0 +1,19 @@
+/* Declarations of WARC helper methods.
*/ +#ifndef WARC_H +#define WARC_H + +#include "host.h" + +void warc_init (); +void warc_close (); +void warc_timestamp (char *timestamp); +void warc_uuid_str (char *id_str); + +FILE * warc_tempfile (); + +bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset); +bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location); +bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset); + +#endif /* WARC_H */ + diff --git a/src/wget.h b/src/wget.h index c7c5e2cb..ee315b6f 100644 --- a/src/wget.h +++ b/src/wget.h @@ -353,7 +353,9 @@ typedef enum PROXERR, /* 50 */ AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR, - UNLINKERR, NEWLOCATION_KEEP_POST + UNLINKERR, NEWLOCATION_KEEP_POST, + + WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR } uerr_t; /* 2005-02-19 SMS.