1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Add support for WARC files.

This commit is contained in:
Gijs van Tulder 2011-11-04 22:25:00 +01:00 committed by Giuseppe Scrivano
parent a5fdba0958
commit e3820953b2
17 changed files with 2048 additions and 95 deletions

View File

@ -28,6 +28,7 @@ gnulib_modules="
accept
alloca
announce-gen
base32
bind
c-ctype
clock-time
@ -49,6 +50,7 @@ maintainer-makefile
mbtowc
mkdir
crypto/md5
crypto/sha1
pipe
quote
quotearg
@ -63,6 +65,7 @@ socket
stdbool
strcasestr
strerror_r-posix
tmpdir
unlocked-io
update-copyright
vasprintf

View File

@ -511,7 +511,19 @@ if test "X$iri" != "Xno"; then
fi
fi
dnl
dnl Check for UUID
dnl
AC_CHECK_HEADER(uuid/uuid.h,
AC_CHECK_LIB(uuid, uuid_generate,
[LIBS="${LIBS} -luuid"
AC_DEFINE([HAVE_LIBUUID], 1,
[Define if libuuid is available.])
])
)
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])

View File

@ -1,3 +1,6 @@
2011-11-04 Giuseppe Scrivano <gscrivano@gnu.org>
2011-10-07 Steven Schweda <address@hidden>
* connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions
@ -21,7 +24,10 @@
* openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const"
"meth" argument to accommodate OpenSSL version 0.9.8, where that
argument is not "const" in the OpenSSL function (SSL_CTX_new).
* test.c: Declare "program_argstring".
* utils.c (fopen_excl): Comment typography.
* warc.h: New file.
* warc.c: New file.
2011-10-02 Henrik Holst <henrik.holst@millistream.com> (tiny change)
* http.c (gethttp): If 'contentonerror' is used then do not

View File

@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c \
css_.c css-url.c \
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
recur.c res.c retr.c spider.c url.c \
recur.c res.c retr.c spider.c url.c warc.c \
utils.c exits.c build_info.c $(IRI_OBJ) \
css-url.h css-tokens.h connect.h convert.h cookies.h \
ftp.h hash.h host.h html-parse.h html-url.h \
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
options.h progress.h ptimer.h recur.h res.h retr.h \
spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \
spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \
exits.h gettext.h
nodist_wget_SOURCES = version.c
EXTRA_wget_SOURCES = iri.c

View File

@ -49,6 +49,7 @@ as that of the covered work. */
#include "netrc.h"
#include "convert.h" /* for downloaded_file */
#include "recur.h" /* for INFINITE_RECURSION */
#include "warc.h"
#ifdef __VMS
# include "vms.h"
@ -237,10 +238,11 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **);
/* Retrieves a file with denoted parameters through opening an FTP
connection to the server. It always closes the data connection,
and closes the control connection in case of error. */
and closes the control connection in case of error. If warc_tmp
is non-NULL, the downloaded data will be written there as well. */
static uerr_t
getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread,
wgint restval, ccon *con, int count)
wgint restval, ccon *con, int count, FILE *warc_tmp)
{
int csock, dtsock, local_sock, res;
uerr_t err = RETROK; /* appease the compiler */
@ -1155,7 +1157,7 @@ Error in server response, closing control connection.\n"));
/* 2011-09-30 SMS.
Added listing files to the set of non-"binary" (text, Stream_LF)
files. (Wget works either way, but other programs, like, say, text
editors, work better on listing files which have text attributes.)
editors, work better on listing files which have text attributes.)
Now we use "binary" attributes for a binary ("IMAGE") transfer,
unless "--ftp-stmlf" was specified, and we always use non-"binary"
(text, Stream_LF) attributes for a listing file, or for an ASCII
@ -1194,7 +1196,7 @@ Error in server response, closing control connection.\n"));
}
else if (opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct
|| opt.output_document || count > 0)
{
{
if (opt.unlink && file_exists_p (con->target))
{
int res = unlink (con->target);
@ -1274,7 +1276,7 @@ Error in server response, closing control connection.\n"));
rd_size = 0;
res = fd_read_body (dtsock, fp,
expected_bytes ? expected_bytes - restval : 0,
restval, &rd_size, qtyread, &con->dltime, flags);
restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp);
tms = datetime_str (time (NULL));
tmrate = retr_rate (rd_size, con->dltime);
@ -1285,15 +1287,18 @@ Error in server response, closing control connection.\n"));
if (!output_stream || con->cmd & DO_LIST)
fclose (fp);
/* If fd_read_body couldn't write to fp, bail out. */
if (res == -2)
/* If fd_read_body couldn't write to fp or warc_tmp, bail out. */
if (res == -2 || (warc_tmp != NULL && res == -3))
{
logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"),
con->target, strerror (errno));
fd_close (csock);
con->csock = -1;
fd_close (dtsock);
return FWRITEERR;
if (res == -2)
return FWRITEERR;
else if (res == -3)
return WARC_TMP_FWRITEERR;
}
else if (res == -1)
{
@ -1409,6 +1414,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
uerr_t err;
struct_stat st;
/* Declare WARC variables. */
bool warc_enabled = (opt.warc_filename != NULL);
FILE *warc_tmp = NULL;
ip_address *warc_ip = NULL;
/* Get the target, and set the name for the message accordingly. */
if ((f == NULL) && (con->target))
{
@ -1445,6 +1455,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0;
/* For file RETR requests, we can write a WARC record.
We record the file contents to a temporary file. */
if (warc_enabled && (con->cmd & DO_RETR))
{
warc_tmp = warc_tempfile ();
if (warc_tmp == NULL)
return WARC_TMP_FOPENERR;
if (!con->proxy && con->csock != -1)
{
warc_ip = (ip_address *) alloca (sizeof (ip_address));
socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER);
}
}
/* THE loop. */
do
{
@ -1509,7 +1534,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
len = f->size;
else
len = 0;
err = getftp (u, len, &qtyread, restval, con, count);
/* If we are working on a WARC record, getftp should also write
to the warc_tmp file. */
err = getftp (u, len, &qtyread, restval, con, count, warc_tmp);
if (con->csock == -1)
con->st &= ~DONE_CWD;
@ -1520,8 +1548,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
{
case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR:
case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED:
case UNLINKERR:
case UNLINKERR: case WARC_TMP_FWRITEERR:
/* Fatal errors, give up. */
if (warc_tmp != NULL)
fclose (warc_tmp);
return err;
case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR:
case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR:
@ -1589,6 +1619,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
xfree (hurl);
}
if (warc_enabled && (con->cmd & DO_RETR))
{
/* Create and store a WARC resource record for the retrieved file. */
bool warc_res;
warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL,
warc_ip, NULL, warc_tmp, -1);
if (! warc_res)
return WARC_ERR;
/* warc_write_resource_record has also closed warc_tmp. */
}
if ((con->cmd & DO_LIST))
/* This is a directory listing file. */
{
@ -1928,7 +1971,9 @@ Already have correct symlink %s -> %s\n\n"),
xfree (ofile);
/* Break on fatals. */
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR)
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR
|| err == WARC_ERR || err == WARC_TMP_FOPENERR
|| err == WARC_TMP_FWRITEERR)
break;
con->cmd &= ~ (DO_CWD | DO_LOGIN);
f = f->next;

View File

@ -58,6 +58,7 @@ as that of the covered work. */
#include "md5.h"
#include "convert.h"
#include "spider.h"
#include "warc.h"
#ifdef TESTING
#include "test.h"
@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name)
p += A_len; \
} while (0)
/* Construct the request and write it to FD using fd_write. */
/* Construct the request and write it to FD using fd_write.
If warc_tmp is set to a file pointer, the request string will
also be written to that file. */
static int
request_send (const struct request *req, int fd)
request_send (const struct request *req, int fd, FILE *warc_tmp)
{
char *request_string, *p;
int i, size, write_error;
@ -374,6 +377,13 @@ request_send (const struct request *req, int fd)
if (write_error < 0)
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
fd_errstr (fd));
else if (warc_tmp != NULL)
{
/* Write a copy of the data to the WARC record. */
int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
if (warc_tmp_written != size - 1)
return -2;
}
return write_error;
}
@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname)
/* Send the contents of FILE_NAME to SOCK. Make sure that exactly
PROMISED_SIZE bytes are sent over the wire -- if the file is
longer, read only that much; if the file is shorter, report an error. */
longer, read only that much; if the file is shorter, report an error.
If warc_tmp is set to a file pointer, the post data will
also be written to that file. */
static int
post_file (int sock, const char *file_name, wgint promised_size)
post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
{
static char chunk[8192];
wgint written = 0;
@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size)
fclose (fp);
return -1;
}
if (warc_tmp != NULL)
{
/* Write a copy of the data to the WARC record. */
int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
if (warc_tmp_written != towrite)
{
fclose (fp);
return -2;
}
}
written += towrite;
}
fclose (fp);
@ -1462,6 +1484,135 @@ File %s already there; not retrieving.\n\n"), quote (filename));
*dt |= TEXTHTML;
}
/* Download the response body from the socket and write it to an
   output file.  The response headers have already been read from the
   socket and are passed in HEAD.  If WARC output is enabled
   (opt.warc_filename is set), the headers and the body are also
   collected in a temporary file and stored as a WARC response record.

   HS, CONTLEN, CONTRANGE, CHUNKED_TRANSFER_ENCODING and URL are
   parameters from the gethttp function.  FP is the output file.

   URL, WARC_TIMESTAMP_STR, WARC_REQUEST_UUID, WARC_IP, TYPE and
   STATCODE are saved in the headers of the WARC record.

   Destinations:
     - FP is NULL and WARC is enabled: the body goes to the WARC file only.
     - FP is non-NULL and WARC is disabled: the body goes to FP only.
     - FP is non-NULL and WARC is enabled: the body goes to both.

   Returns RETRFINISHED on success and also on a read error (in which
   case hs->rderrmsg is set and hs->res is negative), FWRITEERR if
   writing to FP failed, WARC_TMP_FOPENERR or WARC_TMP_FWRITEERR if the
   temporary WARC file could not be opened or written, and WARC_ERR if
   the WARC response record could not be written.  */
static int
read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
                    wgint contrange, bool chunked_transfer_encoding,
                    char *url, char *warc_timestamp_str, char *warc_request_uuid,
                    ip_address *warc_ip, char *type, int statcode, char *head)
{
  FILE *warc_tmp = NULL;
  int warc_payload_offset = 0;
  int flags = 0;

  if (opt.warc_filename != NULL)
    {
      int head_len;
      int head_written;

      /* The response is first collected in a temporary file; it is
         added to the WARC record once the download has finished.  */
      warc_tmp = warc_tempfile ();
      if (warc_tmp == NULL)
        return WARC_TMP_FOPENERR;

      /* The WARC record starts with the response headers; the payload
         begins right after them.  */
      head_len = strlen (head);
      head_written = fwrite (head, 1, head_len, warc_tmp);
      if (head_written != head_len)
        {
          fclose (warc_tmp);
          return WARC_TMP_FWRITEERR;
        }
      warc_payload_offset = head_len;
    }

  /* This confuses the timestamping code that checks for file size.
     #### The timestamping code should be smarter about file size.  */
  if (fp != NULL && opt.save_headers && hs->restval == 0)
    fwrite (head, 1, strlen (head), fp);

  /* Work out how fd_read_body should read the body.  */

  /* If content-length is present, read that much; otherwise, read
     until EOF.  The HTTP spec doesn't require the server to
     actually close the connection when it's done sending data.  */
  if (contlen != -1)
    flags |= rb_read_exactly;

  /* If the server ignored our range request, instruct fd_read_body
     to skip the first RESTVAL bytes of body.  */
  if (fp != NULL && hs->restval > 0 && contrange == 0)
    flags |= rb_skip_startpos;

  if (chunked_transfer_encoding)
    flags |= rb_chunked_transfer_encoding;

  hs->len = hs->restval;
  hs->rd_size = 0;

  /* Download the body into FP and, when a WARC record is being
     assembled, into warc_tmp at the same time.  */
  hs->res = fd_read_body (sock, fp, contlen == -1 ? 0 : contlen,
                          hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
                          flags, warc_tmp);

  if (hs->res < 0)
    {
      /* The download failed, so no record will be written; discard
         the temporary file.  */
      if (warc_tmp != NULL)
        fclose (warc_tmp);

      switch (hs->res)
        {
        case -2:
          /* Error while writing to the output file.  */
          return FWRITEERR;
        case -3:
          /* Error while writing to warc_tmp.  */
          return WARC_TMP_FWRITEERR;
        default:
          /* A read error!  */
          hs->rderrmsg = xstrdup (fd_errstr (sock));
          return RETRFINISHED;
        }
    }

  /* Create a response record and write it to the WARC file.
     Note: per the WARC standard, the request and response should share
     the same date header, so the timestamp of the request is re-used.
     The response record also refers to the uuid of the request.
     warc_write_response_record closes warc_tmp.  */
  if (warc_tmp != NULL
      && !warc_write_response_record (url, warc_timestamp_str,
                                      warc_request_uuid, warc_ip,
                                      warc_tmp, warc_payload_offset,
                                      type, statcode, hs->newloc))
    return WARC_ERR;

  return RETRFINISHED;
}
#define BEGINS_WITH(line, string_constant) \
(!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
&& (c_isspace (line[sizeof (string_constant) - 1]) \
@ -1519,9 +1670,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
wgint contlen, contrange;
struct url *conn;
FILE *fp;
int err;
int sock = -1;
int flags;
/* Set to 1 when the authorization has already been sent and should
not be tried again. */
@ -1547,6 +1698,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
char hdrval[256];
char *message;
/* Declare WARC variables. */
bool warc_enabled = (opt.warc_filename != NULL);
FILE *warc_tmp = NULL;
char warc_timestamp_str [21];
char warc_request_uuid [48];
ip_address *warc_ip = NULL;
long int warc_payload_offset = -1;
/* Whether this connection will be kept alive after the HTTP request
is done. */
bool keep_alive;
@ -1852,7 +2011,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
that the contents of Host would be exactly the same as
the contents of CONNECT. */
write_error = request_send (connreq, sock);
write_error = request_send (connreq, sock, 0);
request_free (connreq);
if (write_error < 0)
{
@ -1924,8 +2083,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
#endif /* HAVE_SSL */
}
/* Open the temporary file where we will write the request. */
if (warc_enabled)
{
warc_tmp = warc_tempfile ();
if (warc_tmp == NULL)
{
CLOSE_INVALIDATE (sock);
request_free (req);
return WARC_TMP_FOPENERR;
}
if (! proxy)
{
warc_ip = (ip_address *) alloca (sizeof (ip_address));
socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
}
}
/* Send the request to server. */
write_error = request_send (req, sock);
write_error = request_send (req, sock, warc_tmp);
if (write_error >= 0)
{
@ -1933,16 +2110,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
{
DEBUGP (("[POST data: %s]\n", opt.post_data));
write_error = fd_write (sock, opt.post_data, post_data_size, -1);
if (write_error >= 0 && warc_tmp != NULL)
{
/* Remember end of headers / start of payload. */
warc_payload_offset = ftell (warc_tmp);
/* Write a copy of the data to the WARC record. */
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
if (warc_tmp_written != post_data_size)
write_error = -2;
}
}
else if (opt.post_file_name && post_data_size != 0)
write_error = post_file (sock, opt.post_file_name, post_data_size);
{
if (warc_tmp != NULL)
/* Remember end of headers / start of payload. */
warc_payload_offset = ftell (warc_tmp);
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
}
}
if (write_error < 0)
{
CLOSE_INVALIDATE (sock);
request_free (req);
return WRITEFAILED;
if (warc_tmp != NULL)
fclose (warc_tmp);
if (write_error == -2)
return WARC_TMP_FWRITEERR;
else
return WRITEFAILED;
}
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
proxy ? "Proxy" : "HTTP");
@ -1950,6 +2150,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
contrange = 0;
*dt &= ~RETROKF;
if (warc_enabled)
{
bool warc_result;
/* Generate a timestamp and uuid for this request. */
warc_timestamp (warc_timestamp_str);
warc_uuid_str (warc_request_uuid);
/* Create a request record and store it in the WARC file. */
warc_result = warc_write_request_record (u->url, warc_timestamp_str,
warc_request_uuid, warc_ip,
warc_tmp, warc_payload_offset);
if (! warc_result)
{
CLOSE_INVALIDATE (sock);
request_free (req);
return WARC_ERR;
}
/* warc_write_request_record has also closed warc_tmp. */
}
read_header:
head = read_http_response_head (sock);
if (!head)
@ -2073,11 +2296,42 @@ read_header:
if (statcode == HTTP_STATUS_UNAUTHORIZED)
{
/* Authorization is required. */
if (keep_alive && !head_only
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
CLOSE_FINISH (sock);
/* Normally we are not interested in the response body.
But if we are writing a WARC file we are: we like to keep everything. */
if (warc_enabled)
{
int err;
type = resp_header_strdup (resp, "Content-Type");
err = read_response_body (hs, sock, NULL, contlen, 0,
chunked_transfer_encoding,
u->url, warc_timestamp_str,
warc_request_uuid, warc_ip, type,
statcode, head);
xfree_null (type);
if (err != RETRFINISHED || hs->res < 0)
{
CLOSE_INVALIDATE (sock);
request_free (req);
xfree_null (message);
resp_free (resp);
xfree (head);
return err;
}
else
CLOSE_FINISH (sock);
}
else
CLOSE_INVALIDATE (sock);
{
/* Since WARC is disabled, we are not interested in the response body. */
if (keep_alive && !head_only
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
CLOSE_FINISH (sock);
else
CLOSE_INVALIDATE (sock);
}
pconn.authorized = false;
if (!auth_finished && (user && passwd))
{
@ -2325,11 +2579,42 @@ read_header:
_("Location: %s%s\n"),
hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
hs->newloc ? _(" [following]") : "");
if (keep_alive && !head_only
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
CLOSE_FINISH (sock);
/* In case the caller cares to look... */
hs->len = 0;
hs->res = 0;
hs->restval = 0;
/* Normally we are not interested in the response body of a redirect.
But if we are writing a WARC file we are: we like to keep everything. */
if (warc_enabled)
{
int err = read_response_body (hs, sock, NULL, contlen, 0,
chunked_transfer_encoding,
u->url, warc_timestamp_str,
warc_request_uuid, warc_ip, type,
statcode, head);
if (err != RETRFINISHED || hs->res < 0)
{
CLOSE_INVALIDATE (sock);
xfree_null (type);
xfree (head);
return err;
}
else
CLOSE_FINISH (sock);
}
else
CLOSE_INVALIDATE (sock);
{
/* Since WARC is disabled, we are not interested in the response body. */
if (keep_alive && !head_only
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
CLOSE_FINISH (sock);
else
CLOSE_INVALIDATE (sock);
}
xfree_null (type);
xfree (head);
/* From RFC2616: The status codes 303 and 307 have
@ -2447,8 +2732,6 @@ read_header:
logputs (LOG_VERBOSE, "\n");
}
}
xfree_null (type);
type = NULL; /* We don't need it any more. */
/* Return if we have no intention of further downloading. */
if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only)
@ -2456,21 +2739,48 @@ read_header:
/* In case the caller cares to look... */
hs->len = 0;
hs->res = 0;
xfree_null (type);
if (head_only)
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
servers not to send body in response to a HEAD request, and
those that do will likely be caught by test_socket_open.
If not, they can be worked around using
`--no-http-keep-alive'. */
CLOSE_FINISH (sock);
else if (keep_alive
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
/* Successfully skipped the body; also keep using the socket. */
CLOSE_FINISH (sock);
hs->restval = 0;
/* Normally we are not interested in the response body of an error response.
But if we are writing a WARC file we are: we like to keep everything. */
if (warc_enabled)
{
int err = read_response_body (hs, sock, NULL, contlen, 0,
chunked_transfer_encoding,
u->url, warc_timestamp_str,
warc_request_uuid, warc_ip, type,
statcode, head);
if (err != RETRFINISHED || hs->res < 0)
{
CLOSE_INVALIDATE (sock);
xfree (head);
xfree_null (type);
return err;
}
else
CLOSE_FINISH (sock);
}
else
CLOSE_INVALIDATE (sock);
{
/* Since WARC is disabled, we are not interested in the response body. */
if (head_only)
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
servers not to send body in response to a HEAD request, and
those that do will likely be caught by test_socket_open.
If not, they can be worked around using
`--no-http-keep-alive'. */
CLOSE_FINISH (sock);
else if (keep_alive
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
/* Successfully skipped the body; also keep using the socket. */
CLOSE_FINISH (sock);
else
CLOSE_INVALIDATE (sock);
}
xfree (head);
xfree_null (type);
return RETRFINISHED;
}
@ -2512,6 +2822,7 @@ read_header:
strerror (errno));
CLOSE_INVALIDATE (sock);
xfree (head);
xfree_null (type);
return UNLINKERR;
}
}
@ -2539,6 +2850,7 @@ read_header:
hs->local_file);
CLOSE_INVALIDATE (sock);
xfree (head);
xfree_null (type);
return FOPEN_EXCL_ERR;
}
}
@ -2547,6 +2859,7 @@ read_header:
logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
CLOSE_INVALIDATE (sock);
xfree (head);
xfree_null (type);
return FOPENERR;
}
}
@ -2560,49 +2873,26 @@ read_header:
HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
}
/* This confuses the timestamping code that checks for file size.
#### The timestamping code should be smarter about file size. */
if (opt.save_headers && hs->restval == 0)
fwrite (head, 1, strlen (head), fp);
err = read_response_body (hs, sock, fp, contlen, contrange,
chunked_transfer_encoding,
u->url, warc_timestamp_str,
warc_request_uuid, warc_ip, type,
statcode, head);
/* Now we no longer need to store the response header. */
xfree (head);
/* Download the request body. */
flags = 0;
if (contlen != -1)
/* If content-length is present, read that much; otherwise, read
until EOF. The HTTP spec doesn't require the server to
actually close the connection when it's done sending data. */
flags |= rb_read_exactly;
if (hs->restval > 0 && contrange == 0)
/* If the server ignored our range request, instruct fd_read_body
to skip the first RESTVAL bytes of body. */
flags |= rb_skip_startpos;
if (chunked_transfer_encoding)
flags |= rb_chunked_transfer_encoding;
hs->len = hs->restval;
hs->rd_size = 0;
hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
flags);
xfree_null (type);
if (hs->res >= 0)
CLOSE_FINISH (sock);
else
{
if (hs->res < 0)
hs->rderrmsg = xstrdup (fd_errstr (sock));
CLOSE_INVALIDATE (sock);
}
CLOSE_INVALIDATE (sock);
if (!output_stream)
fclose (fp);
if (hs->res == -2)
return FWRITEERR;
return RETRFINISHED;
return err;
}
/* The genuine HTTP loop! This is the part where the retrieval is
@ -2626,6 +2916,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc,
char *file_name;
bool force_full_retrieve = false;
/* If we are writing to a WARC file: always retrieve the whole file. */
if (opt.warc_filename != NULL)
force_full_retrieve = true;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
@ -2795,6 +3091,18 @@ Spider mode enabled. Check if remote file exists.\n"));
/* Fatal errors just return from the function. */
ret = err;
goto exit;
case WARC_ERR:
/* A fatal WARC error. */
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n"));
ret = err;
goto exit;
case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
/* A fatal WARC error. */
logputs (LOG_VERBOSE, "\n");
logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
ret = err;
goto exit;
case CONSSLERR:
/* Another fatal error. */
logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));

View File

@ -88,6 +88,7 @@ CMD_DECLARE (cmd_vector);
CMD_DECLARE (cmd_spec_dirstruct);
CMD_DECLARE (cmd_spec_header);
CMD_DECLARE (cmd_spec_warc_header);
CMD_DECLARE (cmd_spec_htmlify);
CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_prefer_family);
@ -264,6 +265,15 @@ static const struct {
{ "verbose", NULL, cmd_spec_verbose },
{ "wait", &opt.wait, cmd_time },
{ "waitretry", &opt.waitretry, cmd_time },
{ "warccdx", &opt.warc_cdx_enabled, cmd_boolean },
{ "warccdxdedup", &opt.warc_cdx_dedup_filename, cmd_file },
{ "warccompression", &opt.warc_compression_enabled, cmd_boolean },
{ "warcdigests", &opt.warc_digests_enabled, cmd_boolean },
{ "warcfile", &opt.warc_filename, cmd_file },
{ "warcheader", NULL, cmd_spec_warc_header },
{ "warckeeplog", &opt.warc_keep_log, cmd_boolean },
{ "warcmaxsize", &opt.warc_maxsize, cmd_bytes },
{ "warctempdir", &opt.warc_tempdir, cmd_directory },
#ifdef USE_WATT32
{ "wdebug", &opt.wdebug, cmd_boolean },
#endif
@ -362,6 +372,14 @@ defaults (void)
opt.useservertimestamps = true;
opt.show_all_dns_entries = false;
opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */
opt.warc_compression_enabled = true;
opt.warc_digests_enabled = true;
opt.warc_cdx_enabled = false;
opt.warc_cdx_dedup_filename = NULL;
opt.warc_tempdir = NULL;
opt.warc_keep_log = true;
}
/* Return the user's home directory (strdup-ed), or NULL if none is
@ -1235,6 +1253,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored)
return true;
}
/* Handler for the "warcheader" command.  A non-empty VAL that passes
   check_user_specified_header is appended to opt.warc_user_headers.
   An empty VAL resets the list of WARC headers.  An invalid header is
   reported on stderr and false is returned.  PLACE_IGNORED is unused.  */
static bool
cmd_spec_warc_header (const char *com, const char *val, void *place_ignored)
{
  if (*val != '\0')
    {
      if (check_user_specified_header (val))
        {
          opt.warc_user_headers = vec_append (opt.warc_user_headers, val);
          return true;
        }

      fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"),
               exec_name, com, quote (val));
      return false;
    }

  /* Empty value means reset the list of headers.  */
  free_vec (opt.warc_user_headers);
  opt.warc_user_headers = NULL;
  return true;
}
static bool
cmd_spec_htmlify (const char *com, const char *val, void *place_ignored)
{
@ -1639,6 +1678,7 @@ cleanup (void)
xfree_null (opt.http_user);
xfree_null (opt.http_passwd);
free_vec (opt.user_headers);
free_vec (opt.warc_user_headers);
# ifdef HAVE_SSL
xfree_null (opt.cert_file);
xfree_null (opt.private_key);

View File

@ -79,6 +79,10 @@ as that of the covered work. */
logging is inhibited, logfp is set back to NULL. */
static FILE *logfp;
/* A second file descriptor pointing to the temporary log file for the
WARC writer. If WARC writing is disabled, this is NULL. */
static FILE *warclogfp;
/* If true, it means logging is inhibited, i.e. nothing is printed or
stored. */
static bool inhibit_logging;
@ -304,6 +308,31 @@ get_log_fp (void)
return logfp;
return stderr;
}
/* Returns the stream of the secondary (WARC) log file.

   Returns NULL if logging is inhibited, or if no WARC log stream is
   currently set (WARCLOGFP is NULL).  Unlike get_log_fp, there is no
   fallback to stderr: callers must check the result for NULL before
   writing to it.  */
static FILE *
get_warc_log_fp (void)
{
  if (inhibit_logging)
    return NULL;
  /* WARCLOGFP is either the configured stream or NULL.  */
  return warclogfp;
}
/* Sets the stream used for the secondary (WARC) log file to FP.
   The value is stored in WARCLOGFP as-is; when it is NULL,
   get_warc_log_fp returns NULL and the logging functions skip the
   secondary log.  */
void
log_set_warc_log_fp (FILE * fp)
{
warclogfp = fp;
}
/* Log a literal string S. The string is logged as-is, without a
newline appended. */
@ -312,13 +341,17 @@ void
logputs (enum log_options o, const char *s)
{
FILE *fp;
FILE *warcfp;
check_redirect_output ();
if ((fp = get_log_fp ()) == NULL)
return;
warcfp = get_warc_log_fp ();
CHECK_VERBOSE (o);
FPUTS (s, fp);
if (warcfp != NULL)
FPUTS (s, warcfp);
if (save_context_p)
saved_append (s);
if (flush_log_p)
@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
int available_size = sizeof (smallmsg);
int numwritten;
FILE *fp = get_log_fp ();
FILE *warcfp = get_warc_log_fp ();
if (!save_context_p)
if (!save_context_p && warcfp == NULL)
{
/* In the simple case just call vfprintf(), to avoid needless
allocation and games with vsnprintf(). */
@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
}
/* Writing succeeded. */
saved_append (write_ptr);
if (save_context_p)
saved_append (write_ptr);
FPUTS (write_ptr, fp);
if (warcfp != NULL)
FPUTS (write_ptr, warcfp);
if (state->bigmsg)
xfree (state->bigmsg);
@ -426,6 +463,7 @@ void
logflush (void)
{
FILE *fp = get_log_fp ();
FILE *warcfp = get_warc_log_fp ();
if (fp)
{
/* 2005-10-25 SMS.
@ -440,6 +478,10 @@ logflush (void)
fflush (fp);
#endif /* def __VMS [else] */
}
if (warcfp != NULL)
fflush (warcfp);
needs_flushing = false;
}
@ -598,6 +640,7 @@ log_dump_context (void)
{
int num = log_line_current;
FILE *fp = get_log_fp ();
FILE *warcfp = get_warc_log_fp ();
if (!fp)
return;
@ -609,14 +652,23 @@ log_dump_context (void)
{
struct log_ln *ln = log_lines + num;
if (ln->content)
FPUTS (ln->content, fp);
{
FPUTS (ln->content, fp);
if (warcfp != NULL)
FPUTS (ln->content, warcfp);
}
ROT_ADVANCE (num);
}
while (num != log_line_current);
if (trailing_line)
if (log_lines[log_line_current].content)
FPUTS (log_lines[log_line_current].content, fp);
{
FPUTS (log_lines[log_line_current].content, fp);
if (warcfp != NULL)
FPUTS (log_lines[log_line_current].content, warcfp);
}
fflush (fp);
fflush (warcfp);
}
/* String escape functions. */

View File

@ -34,8 +34,12 @@ as that of the covered work. */
/* The log file to which Wget writes to after HUP. */
#define DEFAULT_LOGFILE "wget-log"
#include <stdio.h>
enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS };
void log_set_warc_log_fp (FILE *);
void logprintf (enum log_options, const char *, ...)
GCC_FORMAT_ATTR (2, 3);
void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2);

View File

@ -55,6 +55,7 @@ as that of the covered work. */
#include "spider.h"
#include "http.h" /* for save_cookies */
#include "ptimer.h"
#include "warc.h"
#include <getopt.h>
#include <getpass.h>
@ -287,6 +288,15 @@ static struct cmdline_option option_data[] =
{ "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument },
{ "wait", 'w', OPT_VALUE, "wait", -1 },
{ "waitretry", 0, OPT_VALUE, "waitretry", -1 },
{ "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 },
{ "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 },
{ "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 },
{ "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 },
{ "warc-file", 0, OPT_VALUE, "warcfile", -1 },
{ "warc-header", 0, OPT_VALUE, "warcheader", -1 },
{ "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 },
{ "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 },
{ "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 },
#ifdef USE_WATT32
{ "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
#endif
@ -652,6 +662,29 @@ FTP options:\n"),
--retr-symlinks when recursing, get linked-to files (not dir).\n"),
"\n",
N_("\
WARC options:\n"),
N_("\
--warc-file=FILENAME save request/response data to a .warc.gz file.\n"),
N_("\
--warc-header=STRING insert STRING into the warcinfo record.\n"),
N_("\
--warc-max-size=NUMBER set maximum size of WARC files to NUMBER.\n"),
N_("\
--warc-cdx write CDX index files.\n"),
N_("\
--warc-dedup=FILENAME do not store records listed in this CDX file.\n"),
N_("\
--no-warc-compression do not compress WARC files with GZIP.\n"),
N_("\
--no-warc-digests do not calculate SHA1 digests.\n"),
N_("\
--no-warc-keep-log do not store the log file in a WARC record.\n"),
N_("\
--warc-tempdir=DIRECTORY location for temporary files created by the\n\
WARC writer.\n"),
"\n",
N_("\
Recursive download:\n"),
N_("\
@ -910,6 +943,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0)
}
char *program_name; /* Needed by lib/error.c. */
char *program_argstring; /* Needed by warc.c. */
int
main (int argc, char **argv)
@ -945,6 +979,22 @@ main (int argc, char **argv)
windows_main ((char **) &exec_name);
#endif
/* Construct the arguments string. */
int argstring_length = 1;
for (i = 1; i < argc; i++)
argstring_length += strlen (argv[i]) + 2 + 1;
char *p = program_argstring = malloc (argstring_length * sizeof (char));
for (i = 1; i < argc; i++)
{
*p++ = '"';
int arglen = strlen (argv[i]);
memcpy (p, argv[i], arglen);
p += arglen;
*p++ = '"';
*p++ = ' ';
}
*p = '\0';
/* Load the hard-coded defaults. */
defaults ();
@ -1194,6 +1244,47 @@ for details.\n\n"));
}
}
/* Reconcile the other options with WARC output.  This block only runs
   when a WARC file name was given.  Conflicting options are either
   switched off with a warning on stderr or, for --spider, treated as
   a fatal error.  */
if (opt.warc_filename != 0)
{
/* --no-clobber: warn and turn it off.  */
if (opt.noclobber)
{
fprintf (stderr,
_("WARC output does not work with --no-clobber, "
"--no-clobber will be disabled.\n"));
opt.noclobber = false;
}
/* Timestamping: warn and turn it off.  */
if (opt.timestamping)
{
fprintf (stderr,
_("WARC output does not work with timestamping, "
"timestamping will be disabled.\n"));
opt.timestamping = false;
}
/* --spider is the only combination that aborts the program.  */
if (opt.spider)
{
fprintf (stderr,
_("WARC output does not work with --spider.\n"));
exit (1);
}
/* --continue: warn and turn it off.  */
if (opt.always_rest)
{
fprintf (stderr,
_("WARC output does not work with --continue, "
"--continue will be disabled.\n"));
opt.always_rest = false;
}
/* A dedup CDX file without digests only produces a warning; no
   option is changed.  */
if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled)
{
fprintf (stderr,
_("Digests are disabled; WARC deduplication will "
"not find duplicate records.\n"));
}
/* Keeping the log in the WARC forces the "dot" progress type.
   NOTE(review): presumably because other progress output is not
   suitable for a stored log -- confirm against warc.c / progress.c.  */
if (opt.warc_keep_log)
{
opt.progress_type = "dot";
}
}
if (opt.ask_passwd && opt.passwd)
{
fprintf (stderr,
@ -1273,6 +1364,10 @@ for details.\n\n"));
/* Initialize logging. */
log_init (opt.lfilename, append_to_log);
/* Open WARC file. */
if (opt.warc_filename != 0)
warc_init ();
DEBUGP (("DEBUG output created by Wget %s on %s.\n\n",
version_string, OS_TYPE));
@ -1472,7 +1567,12 @@ outputting to a regular file.\n"));
if (opt.convert_links && !opt.delete_after)
convert_all_links ();
/* Close WARC file. */
if (opt.warc_filename != 0)
warc_close ();
log_close ();
for (i = 0; i < nurl; i++)
xfree (url[i]);
cleanup ();

View File

@ -87,6 +87,15 @@ struct options
FTP. */
char *output_document; /* The output file to which the
documents will be printed. */
char *warc_filename; /* WARC output filename */
char *warc_tempdir; /* WARC temp dir */
char *warc_cdx_dedup_filename; /* CDX file to be used for deduplication. */
wgint warc_maxsize; /* WARC max archive size */
bool warc_compression_enabled; /* For GZIP compression. */
bool warc_digests_enabled; /* For SHA1 digests. */
bool warc_cdx_enabled; /* Create CDX files? */
bool warc_keep_log; /* Store the log file in a WARC record. */
char **warc_user_headers; /* User-defined WARC header(s). */
char *user; /* Generic username */
char *passwd; /* Generic password */

View File

@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer)
/* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
amount of data and decrease SKIP. Increment *TOTAL by the amount
of data written. */
of data written. If OUT2 is not NULL, also write BUF to OUT2.
In case of error writing to OUT, -1 is returned. In case of error
writing to OUT2, -2 is returned. If nothing was written (OUT and OUT2
are both NULL, or the data was skipped), 1 is returned; otherwise 0
is returned. */
static int
write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
wgint *written)
write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
wgint *skip, wgint *written)
{
if (!out)
if (out == NULL && out2 == NULL)
return 1;
if (*skip > bufsize)
{
@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
return 1;
}
fwrite (buf, 1, bufsize, out);
if (out != NULL)
fwrite (buf, 1, bufsize, out);
if (out2 != NULL)
fwrite (buf, 1, bufsize, out2);
*written += bufsize;
/* Immediately flush the downloaded data. This should not hinder
@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
actual justification. (Also, why 16K? Anyone test other values?)
*/
#ifndef __VMS
fflush (out);
if (out != NULL)
fflush (out);
if (out2 != NULL)
fflush (out2);
#endif /* ndef __VMS */
return !ferror (out);
if (out != NULL && ferror (out))
return -1;
else if (out2 != NULL && ferror (out2))
return -2;
else
return 0;
}
/* Read the contents of file descriptor FD until the connection
@ -198,13 +212,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
the amount of data written to disk. The time it took to download
the data is stored to ELAPSED.
If OUT2 is non-NULL, the contents is also written to OUT2.
The function exits and returns the amount of data read. In case of
error while reading data, -1 is returned. In case of error while
writing data, -2 is returned. */
writing data to OUT, -2 is returned. In case of error while writing
data to OUT2, -3 is returned. */
int
fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
FILE *out2)
{
int ret = 0;
#undef max
@ -343,9 +361,10 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
if (ret > 0)
{
sum_read += ret;
if (!write_data (out, dlbuf, ret, &skip, &sum_written))
int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
if (write_res != 0)
{
ret = -2;
ret = (write_res == -3) ? -3 : -2;
goto out;
}
if (chunked)

View File

@ -50,7 +50,7 @@ enum {
rb_chunked_transfer_encoding = 4
};
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int);
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *);
typedef const char *(*hunk_terminator_t) (const char *, const char *, int);

View File

@ -46,6 +46,8 @@ const char *test_append_uri_pathel();
const char *test_are_urls_equal();
const char *test_is_robots_txt_url();
const char *program_argstring = "TEST";
int tests_run;
static const char *

1332
src/warc.c Normal file

File diff suppressed because it is too large Load Diff

19
src/warc.h Normal file
View File

@ -0,0 +1,19 @@
/* Declarations of WARC helper methods.

   This header uses FILE and bool without including the headers that
   declare them, and takes ip_address from host.h.
   NOTE(review): presumably every includer pulls in <stdio.h> and a
   bool definition first -- confirm.  */
#ifndef WARC_H
#define WARC_H
#include "host.h"

/* Open the WARC output.  main calls this after logging is initialized
   when a WARC file name was given.  */
void warc_init (void);

/* Close the WARC output.  main calls this just before log_close.  */
void warc_close (void);

/* Presumably stores a timestamp string in TIMESTAMP; the required
   buffer size is not visible here -- TODO confirm in warc.c.  */
void warc_timestamp (char *timestamp);

/* Presumably stores a record id string in ID_STR; the required buffer
   size is not visible here -- TODO confirm in warc.c.  */
void warc_uuid_str (char *id_str);

/* Return a FILE * for temporary data.  NOTE(review): failure value and
   ownership of the stream are defined in warc.c -- confirm.  */
FILE * warc_tempfile (void);

/* Record writers.  Each returns a bool; NOTE(review): presumably true
   on success -- confirm in warc.c.  BODY is the stream holding the
   record data and PAYLOAD_OFFSET an offset into it.  */
bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset);
bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location);
bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset);
#endif /* WARC_H */

View File

@ -353,7 +353,9 @@ typedef enum
PROXERR,
/* 50 */
AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR,
UNLINKERR, NEWLOCATION_KEEP_POST
UNLINKERR, NEWLOCATION_KEEP_POST,
WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR
} uerr_t;
/* 2005-02-19 SMS.