mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Add support for WARC files.
This commit is contained in:
parent
a5fdba0958
commit
e3820953b2
@ -28,6 +28,7 @@ gnulib_modules="
|
||||
accept
|
||||
alloca
|
||||
announce-gen
|
||||
base32
|
||||
bind
|
||||
c-ctype
|
||||
clock-time
|
||||
@ -49,6 +50,7 @@ maintainer-makefile
|
||||
mbtowc
|
||||
mkdir
|
||||
crypto/md5
|
||||
crypto/sha1
|
||||
pipe
|
||||
quote
|
||||
quotearg
|
||||
@ -63,6 +65,7 @@ socket
|
||||
stdbool
|
||||
strcasestr
|
||||
strerror_r-posix
|
||||
tmpdir
|
||||
unlocked-io
|
||||
update-copyright
|
||||
vasprintf
|
||||
|
12
configure.ac
12
configure.ac
@ -511,7 +511,19 @@ if test "X$iri" != "Xno"; then
|
||||
fi
|
||||
fi
|
||||
|
||||
dnl
|
||||
dnl Check for UUID
|
||||
dnl
|
||||
|
||||
AC_CHECK_HEADER(uuid/uuid.h,
|
||||
AC_CHECK_LIB(uuid, uuid_generate,
|
||||
[LIBS="${LIBS} -luuid"
|
||||
AC_DEFINE([HAVE_LIBUUID], 1,
|
||||
[Define if libuuid is available.])
|
||||
])
|
||||
)
|
||||
|
||||
|
||||
dnl Needed by src/Makefile.am
|
||||
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
||||
|
||||
|
@ -1,3 +1,6 @@
|
||||
2011-11-04 Giuseppe Scrivano <gscrivano@gnu.org>
|
||||
|
||||
|
||||
2011-10-07 Steven Schweda <address@hidden>
|
||||
|
||||
* connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions
|
||||
@ -21,7 +24,10 @@
|
||||
* openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const"
|
||||
"meth" argument to accommodate OpenSSL version 0.9.8, where that
|
||||
argument is not "const" in the OpenSSL function (SSL_CTX_new).
|
||||
* test.c: Declare "program_argstring".
|
||||
* utils.c (fopen_excl): Comment typography.
|
||||
* warc.h: New file.
|
||||
* warc.c: New file.
|
||||
|
||||
2011-10-02 Henrik Holst <henrik.holst@millistream.com> (tiny change)
|
||||
* http.c (gethttp): If 'contentonerror' is used then do not
|
||||
|
@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c \
|
||||
css_.c css-url.c \
|
||||
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
|
||||
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
|
||||
recur.c res.c retr.c spider.c url.c \
|
||||
recur.c res.c retr.c spider.c url.c warc.c \
|
||||
utils.c exits.c build_info.c $(IRI_OBJ) \
|
||||
css-url.h css-tokens.h connect.h convert.h cookies.h \
|
||||
ftp.h hash.h host.h html-parse.h html-url.h \
|
||||
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
|
||||
options.h progress.h ptimer.h recur.h res.h retr.h \
|
||||
spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \
|
||||
spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \
|
||||
exits.h gettext.h
|
||||
nodist_wget_SOURCES = version.c
|
||||
EXTRA_wget_SOURCES = iri.c
|
||||
|
67
src/ftp.c
67
src/ftp.c
@ -49,6 +49,7 @@ as that of the covered work. */
|
||||
#include "netrc.h"
|
||||
#include "convert.h" /* for downloaded_file */
|
||||
#include "recur.h" /* for INFINITE_RECURSION */
|
||||
#include "warc.h"
|
||||
|
||||
#ifdef __VMS
|
||||
# include "vms.h"
|
||||
@ -237,10 +238,11 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **);
|
||||
|
||||
/* Retrieves a file with denoted parameters through opening an FTP
|
||||
connection to the server. It always closes the data connection,
|
||||
and closes the control connection in case of error. */
|
||||
and closes the control connection in case of error. If warc_tmp
|
||||
is non-NULL, the downloaded data will be written there as well. */
|
||||
static uerr_t
|
||||
getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread,
|
||||
wgint restval, ccon *con, int count)
|
||||
wgint restval, ccon *con, int count, FILE *warc_tmp)
|
||||
{
|
||||
int csock, dtsock, local_sock, res;
|
||||
uerr_t err = RETROK; /* appease the compiler */
|
||||
@ -1155,7 +1157,7 @@ Error in server response, closing control connection.\n"));
|
||||
/* 2011-09-30 SMS.
|
||||
Added listing files to the set of non-"binary" (text, Stream_LF)
|
||||
files. (Wget works either way, but other programs, like, say, text
|
||||
editors, work better on listing files which have text attributes.)
|
||||
editors, work better on listing files which have text attributes.)
|
||||
Now we use "binary" attributes for a binary ("IMAGE") transfer,
|
||||
unless "--ftp-stmlf" was specified, and we always use non-"binary"
|
||||
(text, Stream_LF) attributes for a listing file, or for an ASCII
|
||||
@ -1194,7 +1196,7 @@ Error in server response, closing control connection.\n"));
|
||||
}
|
||||
else if (opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct
|
||||
|| opt.output_document || count > 0)
|
||||
{
|
||||
{
|
||||
if (opt.unlink && file_exists_p (con->target))
|
||||
{
|
||||
int res = unlink (con->target);
|
||||
@ -1274,7 +1276,7 @@ Error in server response, closing control connection.\n"));
|
||||
rd_size = 0;
|
||||
res = fd_read_body (dtsock, fp,
|
||||
expected_bytes ? expected_bytes - restval : 0,
|
||||
restval, &rd_size, qtyread, &con->dltime, flags);
|
||||
restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp);
|
||||
|
||||
tms = datetime_str (time (NULL));
|
||||
tmrate = retr_rate (rd_size, con->dltime);
|
||||
@ -1285,15 +1287,18 @@ Error in server response, closing control connection.\n"));
|
||||
if (!output_stream || con->cmd & DO_LIST)
|
||||
fclose (fp);
|
||||
|
||||
/* If fd_read_body couldn't write to fp, bail out. */
|
||||
if (res == -2)
|
||||
/* If fd_read_body couldn't write to fp or warc_tmp, bail out. */
|
||||
if (res == -2 || (warc_tmp != NULL && res == -3))
|
||||
{
|
||||
logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"),
|
||||
con->target, strerror (errno));
|
||||
fd_close (csock);
|
||||
con->csock = -1;
|
||||
fd_close (dtsock);
|
||||
return FWRITEERR;
|
||||
if (res == -2)
|
||||
return FWRITEERR;
|
||||
else if (res == -3)
|
||||
return WARC_TMP_FWRITEERR;
|
||||
}
|
||||
else if (res == -1)
|
||||
{
|
||||
@ -1409,6 +1414,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
||||
uerr_t err;
|
||||
struct_stat st;
|
||||
|
||||
/* Declare WARC variables. */
|
||||
bool warc_enabled = (opt.warc_filename != NULL);
|
||||
FILE *warc_tmp = NULL;
|
||||
ip_address *warc_ip = NULL;
|
||||
|
||||
/* Get the target, and set the name for the message accordingly. */
|
||||
if ((f == NULL) && (con->target))
|
||||
{
|
||||
@ -1445,6 +1455,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
||||
|
||||
orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0;
|
||||
|
||||
/* For file RETR requests, we can write a WARC record.
|
||||
We record the file contents to a temporary file. */
|
||||
if (warc_enabled && (con->cmd & DO_RETR))
|
||||
{
|
||||
warc_tmp = warc_tempfile ();
|
||||
if (warc_tmp == NULL)
|
||||
return WARC_TMP_FOPENERR;
|
||||
|
||||
if (!con->proxy && con->csock != -1)
|
||||
{
|
||||
warc_ip = (ip_address *) alloca (sizeof (ip_address));
|
||||
socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER);
|
||||
}
|
||||
}
|
||||
|
||||
/* THE loop. */
|
||||
do
|
||||
{
|
||||
@ -1509,7 +1534,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
||||
len = f->size;
|
||||
else
|
||||
len = 0;
|
||||
err = getftp (u, len, &qtyread, restval, con, count);
|
||||
|
||||
/* If we are working on a WARC record, getftp should also write
|
||||
to the warc_tmp file. */
|
||||
err = getftp (u, len, &qtyread, restval, con, count, warc_tmp);
|
||||
|
||||
if (con->csock == -1)
|
||||
con->st &= ~DONE_CWD;
|
||||
@ -1520,8 +1548,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
||||
{
|
||||
case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR:
|
||||
case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED:
|
||||
case UNLINKERR:
|
||||
case UNLINKERR: case WARC_TMP_FWRITEERR:
|
||||
/* Fatal errors, give up. */
|
||||
if (warc_tmp != NULL)
|
||||
fclose (warc_tmp);
|
||||
return err;
|
||||
case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR:
|
||||
case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR:
|
||||
@ -1589,6 +1619,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
||||
xfree (hurl);
|
||||
}
|
||||
|
||||
if (warc_enabled && (con->cmd & DO_RETR))
|
||||
{
|
||||
/* Create and store a WARC resource record for the retrieved file. */
|
||||
bool warc_res;
|
||||
|
||||
warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL,
|
||||
warc_ip, NULL, warc_tmp, -1);
|
||||
if (! warc_res)
|
||||
return WARC_ERR;
|
||||
|
||||
/* warc_write_resource_record has also closed warc_tmp. */
|
||||
}
|
||||
|
||||
if ((con->cmd & DO_LIST))
|
||||
/* This is a directory listing file. */
|
||||
{
|
||||
@ -1928,7 +1971,9 @@ Already have correct symlink %s -> %s\n\n"),
|
||||
xfree (ofile);
|
||||
|
||||
/* Break on fatals. */
|
||||
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR)
|
||||
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR
|
||||
|| err == WARC_ERR || err == WARC_TMP_FOPENERR
|
||||
|| err == WARC_TMP_FWRITEERR)
|
||||
break;
|
||||
con->cmd &= ~ (DO_CWD | DO_LOGIN);
|
||||
f = f->next;
|
||||
|
438
src/http.c
438
src/http.c
@ -58,6 +58,7 @@ as that of the covered work. */
|
||||
#include "md5.h"
|
||||
#include "convert.h"
|
||||
#include "spider.h"
|
||||
#include "warc.h"
|
||||
|
||||
#ifdef TESTING
|
||||
#include "test.h"
|
||||
@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name)
|
||||
p += A_len; \
|
||||
} while (0)
|
||||
|
||||
/* Construct the request and write it to FD using fd_write. */
|
||||
/* Construct the request and write it to FD using fd_write.
|
||||
If warc_tmp is set to a file pointer, the request string will
|
||||
also be written to that file. */
|
||||
|
||||
static int
|
||||
request_send (const struct request *req, int fd)
|
||||
request_send (const struct request *req, int fd, FILE *warc_tmp)
|
||||
{
|
||||
char *request_string, *p;
|
||||
int i, size, write_error;
|
||||
@ -374,6 +377,13 @@ request_send (const struct request *req, int fd)
|
||||
if (write_error < 0)
|
||||
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
|
||||
fd_errstr (fd));
|
||||
else if (warc_tmp != NULL)
|
||||
{
|
||||
/* Write a copy of the data to the WARC record. */
|
||||
int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
|
||||
if (warc_tmp_written != size - 1)
|
||||
return -2;
|
||||
}
|
||||
return write_error;
|
||||
}
|
||||
|
||||
@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname)
|
||||
|
||||
/* Send the contents of FILE_NAME to SOCK. Make sure that exactly
|
||||
PROMISED_SIZE bytes are sent over the wire -- if the file is
|
||||
longer, read only that much; if the file is shorter, report an error. */
|
||||
longer, read only that much; if the file is shorter, report an error.
|
||||
If warc_tmp is set to a file pointer, the post data will
|
||||
also be written to that file. */
|
||||
|
||||
static int
|
||||
post_file (int sock, const char *file_name, wgint promised_size)
|
||||
post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
|
||||
{
|
||||
static char chunk[8192];
|
||||
wgint written = 0;
|
||||
@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size)
|
||||
fclose (fp);
|
||||
return -1;
|
||||
}
|
||||
if (warc_tmp != NULL)
|
||||
{
|
||||
/* Write a copy of the data to the WARC record. */
|
||||
int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
|
||||
if (warc_tmp_written != towrite)
|
||||
{
|
||||
fclose (fp);
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
written += towrite;
|
||||
}
|
||||
fclose (fp);
|
||||
@ -1462,6 +1484,135 @@ File %s already there; not retrieving.\n\n"), quote (filename));
|
||||
*dt |= TEXTHTML;
|
||||
}
|
||||
|
||||
/* Downloads the response body from the socket and writes it to
|
||||
an output file. The headers have already been read from the
|
||||
socket. If WARC is enabled, the response body will also be
|
||||
written to a WARC response record.
|
||||
|
||||
hs, contlen, contrange, chunked_transfer_encoding and url are
|
||||
parameters from the gethttp method. fp is a pointer to the
|
||||
output file.
|
||||
|
||||
url, warc_timestamp_str, warc_request_uuid, warc_ip, type
|
||||
and statcode will be saved in the headers of the WARC record.
|
||||
The head parameter contains the HTTP headers of the response.
|
||||
|
||||
If fp is NULL and WARC is enabled, the response body will be
|
||||
written only to the WARC file. If WARC is disabled and fp
|
||||
is a file pointer, the data will be written to the file.
|
||||
If fp is a file pointer and WARC is enabled, the body will
|
||||
be written to both destinations.
|
||||
|
||||
Returns the error code. */
|
||||
static int
|
||||
read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
|
||||
wgint contrange, bool chunked_transfer_encoding,
|
||||
char *url, char *warc_timestamp_str, char *warc_request_uuid,
|
||||
ip_address *warc_ip, char *type, int statcode, char *head)
|
||||
{
|
||||
int warc_payload_offset = 0;
|
||||
FILE *warc_tmp = NULL;
|
||||
int warcerr = 0;
|
||||
|
||||
if (opt.warc_filename != NULL)
|
||||
{
|
||||
/* Open a temporary file where we can write the response before we
|
||||
add it to the WARC record. */
|
||||
warc_tmp = warc_tempfile ();
|
||||
if (warc_tmp == NULL)
|
||||
warcerr = WARC_TMP_FOPENERR;
|
||||
|
||||
if (warcerr == 0)
|
||||
{
|
||||
/* We should keep the response headers for the WARC record. */
|
||||
int head_len = strlen (head);
|
||||
int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp);
|
||||
if (warc_tmp_written != head_len)
|
||||
warcerr = WARC_TMP_FWRITEERR;
|
||||
warc_payload_offset = head_len;
|
||||
}
|
||||
|
||||
if (warcerr != 0)
|
||||
{
|
||||
if (warc_tmp != NULL)
|
||||
fclose (warc_tmp);
|
||||
return warcerr;
|
||||
}
|
||||
}
|
||||
|
||||
if (fp != NULL)
|
||||
{
|
||||
/* This confuses the timestamping code that checks for file size.
|
||||
#### The timestamping code should be smarter about file size. */
|
||||
if (opt.save_headers && hs->restval == 0)
|
||||
fwrite (head, 1, strlen (head), fp);
|
||||
}
|
||||
|
||||
/* Read the response body. */
|
||||
int flags = 0;
|
||||
if (contlen != -1)
|
||||
/* If content-length is present, read that much; otherwise, read
|
||||
until EOF. The HTTP spec doesn't require the server to
|
||||
actually close the connection when it's done sending data. */
|
||||
flags |= rb_read_exactly;
|
||||
if (fp != NULL && hs->restval > 0 && contrange == 0)
|
||||
/* If the server ignored our range request, instruct fd_read_body
|
||||
to skip the first RESTVAL bytes of body. */
|
||||
flags |= rb_skip_startpos;
|
||||
if (chunked_transfer_encoding)
|
||||
flags |= rb_chunked_transfer_encoding;
|
||||
|
||||
hs->len = hs->restval;
|
||||
hs->rd_size = 0;
|
||||
/* Download the response body and write it to fp.
|
||||
If we are working on a WARC file, we simultaneously write the
|
||||
response body to warc_tmp. */
|
||||
hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
|
||||
hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
|
||||
flags, warc_tmp);
|
||||
if (hs->res >= 0)
|
||||
{
|
||||
if (warc_tmp != NULL)
|
||||
{
|
||||
/* Create a response record and write it to the WARC file.
|
||||
Note: per the WARC standard, the request and response should share
|
||||
the same date header. We re-use the timestamp of the request.
|
||||
The response record should also refer to the uuid of the request. */
|
||||
bool r = warc_write_response_record (url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip,
|
||||
warc_tmp, warc_payload_offset,
|
||||
type, statcode, hs->newloc);
|
||||
|
||||
/* warc_write_response_record has closed warc_tmp. */
|
||||
|
||||
if (! r)
|
||||
return WARC_ERR;
|
||||
}
|
||||
|
||||
return RETRFINISHED;
|
||||
}
|
||||
|
||||
if (warc_tmp != NULL)
|
||||
fclose (warc_tmp);
|
||||
|
||||
if (hs->res == -2)
|
||||
{
|
||||
/* Error while writing to fp. */
|
||||
return FWRITEERR;
|
||||
}
|
||||
else if (hs->res == -3)
|
||||
{
|
||||
/* Error while writing to warc_tmp. */
|
||||
return WARC_TMP_FWRITEERR;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* A read error! */
|
||||
hs->rderrmsg = xstrdup (fd_errstr (sock));
|
||||
return RETRFINISHED;
|
||||
}
|
||||
}
|
||||
|
||||
#define BEGINS_WITH(line, string_constant) \
|
||||
(!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
|
||||
&& (c_isspace (line[sizeof (string_constant) - 1]) \
|
||||
@ -1519,9 +1670,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
wgint contlen, contrange;
|
||||
struct url *conn;
|
||||
FILE *fp;
|
||||
int err;
|
||||
|
||||
int sock = -1;
|
||||
int flags;
|
||||
|
||||
/* Set to 1 when the authorization has already been sent and should
|
||||
not be tried again. */
|
||||
@ -1547,6 +1698,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
char hdrval[256];
|
||||
char *message;
|
||||
|
||||
/* Declare WARC variables. */
|
||||
bool warc_enabled = (opt.warc_filename != NULL);
|
||||
FILE *warc_tmp = NULL;
|
||||
char warc_timestamp_str [21];
|
||||
char warc_request_uuid [48];
|
||||
ip_address *warc_ip = NULL;
|
||||
long int warc_payload_offset = -1;
|
||||
|
||||
/* Whether this connection will be kept alive after the HTTP request
|
||||
is done. */
|
||||
bool keep_alive;
|
||||
@ -1852,7 +2011,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
that the contents of Host would be exactly the same as
|
||||
the contents of CONNECT. */
|
||||
|
||||
write_error = request_send (connreq, sock);
|
||||
write_error = request_send (connreq, sock, 0);
|
||||
request_free (connreq);
|
||||
if (write_error < 0)
|
||||
{
|
||||
@ -1924,8 +2083,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
#endif /* HAVE_SSL */
|
||||
}
|
||||
|
||||
/* Open the temporary file where we will write the request. */
|
||||
if (warc_enabled)
|
||||
{
|
||||
warc_tmp = warc_tempfile ();
|
||||
if (warc_tmp == NULL)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
request_free (req);
|
||||
return WARC_TMP_FOPENERR;
|
||||
}
|
||||
|
||||
if (! proxy)
|
||||
{
|
||||
warc_ip = (ip_address *) alloca (sizeof (ip_address));
|
||||
socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
|
||||
}
|
||||
}
|
||||
|
||||
/* Send the request to server. */
|
||||
write_error = request_send (req, sock);
|
||||
write_error = request_send (req, sock, warc_tmp);
|
||||
|
||||
if (write_error >= 0)
|
||||
{
|
||||
@ -1933,16 +2110,39 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
{
|
||||
DEBUGP (("[POST data: %s]\n", opt.post_data));
|
||||
write_error = fd_write (sock, opt.post_data, post_data_size, -1);
|
||||
if (write_error >= 0 && warc_tmp != NULL)
|
||||
{
|
||||
/* Remember end of headers / start of payload. */
|
||||
warc_payload_offset = ftell (warc_tmp);
|
||||
|
||||
/* Write a copy of the data to the WARC record. */
|
||||
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
|
||||
if (warc_tmp_written != post_data_size)
|
||||
write_error = -2;
|
||||
}
|
||||
}
|
||||
else if (opt.post_file_name && post_data_size != 0)
|
||||
write_error = post_file (sock, opt.post_file_name, post_data_size);
|
||||
{
|
||||
if (warc_tmp != NULL)
|
||||
/* Remember end of headers / start of payload. */
|
||||
warc_payload_offset = ftell (warc_tmp);
|
||||
|
||||
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
|
||||
}
|
||||
}
|
||||
|
||||
if (write_error < 0)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
request_free (req);
|
||||
return WRITEFAILED;
|
||||
|
||||
if (warc_tmp != NULL)
|
||||
fclose (warc_tmp);
|
||||
|
||||
if (write_error == -2)
|
||||
return WARC_TMP_FWRITEERR;
|
||||
else
|
||||
return WRITEFAILED;
|
||||
}
|
||||
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
|
||||
proxy ? "Proxy" : "HTTP");
|
||||
@ -1950,6 +2150,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
||||
contrange = 0;
|
||||
*dt &= ~RETROKF;
|
||||
|
||||
|
||||
if (warc_enabled)
|
||||
{
|
||||
bool warc_result;
|
||||
/* Generate a timestamp and uuid for this request. */
|
||||
warc_timestamp (warc_timestamp_str);
|
||||
warc_uuid_str (warc_request_uuid);
|
||||
|
||||
/* Create a request record and store it in the WARC file. */
|
||||
warc_result = warc_write_request_record (u->url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip,
|
||||
warc_tmp, warc_payload_offset);
|
||||
if (! warc_result)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
request_free (req);
|
||||
return WARC_ERR;
|
||||
}
|
||||
|
||||
/* warc_write_request_record has also closed warc_tmp. */
|
||||
}
|
||||
|
||||
|
||||
read_header:
|
||||
head = read_http_response_head (sock);
|
||||
if (!head)
|
||||
@ -2073,11 +2296,42 @@ read_header:
|
||||
if (statcode == HTTP_STATUS_UNAUTHORIZED)
|
||||
{
|
||||
/* Authorization is required. */
|
||||
if (keep_alive && !head_only
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
CLOSE_FINISH (sock);
|
||||
|
||||
/* Normally we are not interested in the response body.
|
||||
But if we are writing a WARC file we are: we like to keep everything. */
|
||||
if (warc_enabled)
|
||||
{
|
||||
int err;
|
||||
type = resp_header_strdup (resp, "Content-Type");
|
||||
err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||
chunked_transfer_encoding,
|
||||
u->url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip, type,
|
||||
statcode, head);
|
||||
xfree_null (type);
|
||||
|
||||
if (err != RETRFINISHED || hs->res < 0)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
request_free (req);
|
||||
xfree_null (message);
|
||||
resp_free (resp);
|
||||
xfree (head);
|
||||
return err;
|
||||
}
|
||||
else
|
||||
CLOSE_FINISH (sock);
|
||||
}
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
{
|
||||
/* Since WARC is disabled, we are not interested in the response body. */
|
||||
if (keep_alive && !head_only
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
CLOSE_FINISH (sock);
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
}
|
||||
|
||||
pconn.authorized = false;
|
||||
if (!auth_finished && (user && passwd))
|
||||
{
|
||||
@ -2325,11 +2579,42 @@ read_header:
|
||||
_("Location: %s%s\n"),
|
||||
hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
|
||||
hs->newloc ? _(" [following]") : "");
|
||||
if (keep_alive && !head_only
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
CLOSE_FINISH (sock);
|
||||
|
||||
/* In case the caller cares to look... */
|
||||
hs->len = 0;
|
||||
hs->res = 0;
|
||||
hs->restval = 0;
|
||||
|
||||
/* Normally we are not interested in the response body of a redirect.
|
||||
But if we are writing a WARC file we are: we like to keep everything. */
|
||||
if (warc_enabled)
|
||||
{
|
||||
int err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||
chunked_transfer_encoding,
|
||||
u->url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip, type,
|
||||
statcode, head);
|
||||
|
||||
if (err != RETRFINISHED || hs->res < 0)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
xfree_null (type);
|
||||
xfree (head);
|
||||
return err;
|
||||
}
|
||||
else
|
||||
CLOSE_FINISH (sock);
|
||||
}
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
{
|
||||
/* Since WARC is disabled, we are not interested in the response body. */
|
||||
if (keep_alive && !head_only
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
CLOSE_FINISH (sock);
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
}
|
||||
|
||||
xfree_null (type);
|
||||
xfree (head);
|
||||
/* From RFC2616: The status codes 303 and 307 have
|
||||
@ -2447,8 +2732,6 @@ read_header:
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
}
|
||||
}
|
||||
xfree_null (type);
|
||||
type = NULL; /* We don't need it any more. */
|
||||
|
||||
/* Return if we have no intention of further downloading. */
|
||||
if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only)
|
||||
@ -2456,21 +2739,48 @@ read_header:
|
||||
/* In case the caller cares to look... */
|
||||
hs->len = 0;
|
||||
hs->res = 0;
|
||||
xfree_null (type);
|
||||
if (head_only)
|
||||
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
|
||||
servers not to send body in response to a HEAD request, and
|
||||
those that do will likely be caught by test_socket_open.
|
||||
If not, they can be worked around using
|
||||
`--no-http-keep-alive'. */
|
||||
CLOSE_FINISH (sock);
|
||||
else if (keep_alive
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
/* Successfully skipped the body; also keep using the socket. */
|
||||
CLOSE_FINISH (sock);
|
||||
hs->restval = 0;
|
||||
|
||||
/* Normally we are not interested in the response body of an error response.
|
||||
But if we are writing a WARC file we are: we like to keep everything. */
|
||||
if (warc_enabled)
|
||||
{
|
||||
int err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||
chunked_transfer_encoding,
|
||||
u->url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip, type,
|
||||
statcode, head);
|
||||
|
||||
if (err != RETRFINISHED || hs->res < 0)
|
||||
{
|
||||
CLOSE_INVALIDATE (sock);
|
||||
xfree (head);
|
||||
xfree_null (type);
|
||||
return err;
|
||||
}
|
||||
else
|
||||
CLOSE_FINISH (sock);
|
||||
}
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
{
|
||||
/* Since WARC is disabled, we are not interested in the response body. */
|
||||
if (head_only)
|
||||
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
|
||||
servers not to send body in response to a HEAD request, and
|
||||
those that do will likely be caught by test_socket_open.
|
||||
If not, they can be worked around using
|
||||
`--no-http-keep-alive'. */
|
||||
CLOSE_FINISH (sock);
|
||||
else if (keep_alive
|
||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||
/* Successfully skipped the body; also keep using the socket. */
|
||||
CLOSE_FINISH (sock);
|
||||
else
|
||||
CLOSE_INVALIDATE (sock);
|
||||
}
|
||||
|
||||
xfree (head);
|
||||
xfree_null (type);
|
||||
return RETRFINISHED;
|
||||
}
|
||||
|
||||
@ -2512,6 +2822,7 @@ read_header:
|
||||
strerror (errno));
|
||||
CLOSE_INVALIDATE (sock);
|
||||
xfree (head);
|
||||
xfree_null (type);
|
||||
return UNLINKERR;
|
||||
}
|
||||
}
|
||||
@ -2539,6 +2850,7 @@ read_header:
|
||||
hs->local_file);
|
||||
CLOSE_INVALIDATE (sock);
|
||||
xfree (head);
|
||||
xfree_null (type);
|
||||
return FOPEN_EXCL_ERR;
|
||||
}
|
||||
}
|
||||
@ -2547,6 +2859,7 @@ read_header:
|
||||
logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
|
||||
CLOSE_INVALIDATE (sock);
|
||||
xfree (head);
|
||||
xfree_null (type);
|
||||
return FOPENERR;
|
||||
}
|
||||
}
|
||||
@ -2560,49 +2873,26 @@ read_header:
|
||||
HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
|
||||
}
|
||||
|
||||
/* This confuses the timestamping code that checks for file size.
|
||||
#### The timestamping code should be smarter about file size. */
|
||||
if (opt.save_headers && hs->restval == 0)
|
||||
fwrite (head, 1, strlen (head), fp);
|
||||
|
||||
err = read_response_body (hs, sock, fp, contlen, contrange,
|
||||
chunked_transfer_encoding,
|
||||
u->url, warc_timestamp_str,
|
||||
warc_request_uuid, warc_ip, type,
|
||||
statcode, head);
|
||||
|
||||
/* Now we no longer need to store the response header. */
|
||||
xfree (head);
|
||||
|
||||
/* Download the request body. */
|
||||
flags = 0;
|
||||
if (contlen != -1)
|
||||
/* If content-length is present, read that much; otherwise, read
|
||||
until EOF. The HTTP spec doesn't require the server to
|
||||
actually close the connection when it's done sending data. */
|
||||
flags |= rb_read_exactly;
|
||||
if (hs->restval > 0 && contrange == 0)
|
||||
/* If the server ignored our range request, instruct fd_read_body
|
||||
to skip the first RESTVAL bytes of body. */
|
||||
flags |= rb_skip_startpos;
|
||||
|
||||
if (chunked_transfer_encoding)
|
||||
flags |= rb_chunked_transfer_encoding;
|
||||
|
||||
hs->len = hs->restval;
|
||||
hs->rd_size = 0;
|
||||
hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
|
||||
hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
|
||||
flags);
|
||||
xfree_null (type);
|
||||
|
||||
if (hs->res >= 0)
|
||||
CLOSE_FINISH (sock);
|
||||
else
|
||||
{
|
||||
if (hs->res < 0)
|
||||
hs->rderrmsg = xstrdup (fd_errstr (sock));
|
||||
CLOSE_INVALIDATE (sock);
|
||||
}
|
||||
CLOSE_INVALIDATE (sock);
|
||||
|
||||
if (!output_stream)
|
||||
fclose (fp);
|
||||
if (hs->res == -2)
|
||||
return FWRITEERR;
|
||||
return RETRFINISHED;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/* The genuine HTTP loop! This is the part where the retrieval is
|
||||
@ -2626,6 +2916,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc,
|
||||
char *file_name;
|
||||
bool force_full_retrieve = false;
|
||||
|
||||
|
||||
/* If we are writing to a WARC file: always retrieve the whole file. */
|
||||
if (opt.warc_filename != NULL)
|
||||
force_full_retrieve = true;
|
||||
|
||||
|
||||
/* Assert that no value for *LOCAL_FILE was passed. */
|
||||
assert (local_file == NULL || *local_file == NULL);
|
||||
|
||||
@ -2795,6 +3091,18 @@ Spider mode enabled. Check if remote file exists.\n"));
|
||||
/* Fatal errors just return from the function. */
|
||||
ret = err;
|
||||
goto exit;
|
||||
case WARC_ERR:
|
||||
/* A fatal WARC error. */
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n"));
|
||||
ret = err;
|
||||
goto exit;
|
||||
case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
|
||||
/* A fatal WARC error. */
|
||||
logputs (LOG_VERBOSE, "\n");
|
||||
logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
|
||||
ret = err;
|
||||
goto exit;
|
||||
case CONSSLERR:
|
||||
/* Another fatal error. */
|
||||
logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
|
||||
|
40
src/init.c
40
src/init.c
@ -88,6 +88,7 @@ CMD_DECLARE (cmd_vector);
|
||||
|
||||
CMD_DECLARE (cmd_spec_dirstruct);
|
||||
CMD_DECLARE (cmd_spec_header);
|
||||
CMD_DECLARE (cmd_spec_warc_header);
|
||||
CMD_DECLARE (cmd_spec_htmlify);
|
||||
CMD_DECLARE (cmd_spec_mirror);
|
||||
CMD_DECLARE (cmd_spec_prefer_family);
|
||||
@ -264,6 +265,15 @@ static const struct {
|
||||
{ "verbose", NULL, cmd_spec_verbose },
|
||||
{ "wait", &opt.wait, cmd_time },
|
||||
{ "waitretry", &opt.waitretry, cmd_time },
|
||||
{ "warccdx", &opt.warc_cdx_enabled, cmd_boolean },
|
||||
{ "warccdxdedup", &opt.warc_cdx_dedup_filename, cmd_file },
|
||||
{ "warccompression", &opt.warc_compression_enabled, cmd_boolean },
|
||||
{ "warcdigests", &opt.warc_digests_enabled, cmd_boolean },
|
||||
{ "warcfile", &opt.warc_filename, cmd_file },
|
||||
{ "warcheader", NULL, cmd_spec_warc_header },
|
||||
{ "warckeeplog", &opt.warc_keep_log, cmd_boolean },
|
||||
{ "warcmaxsize", &opt.warc_maxsize, cmd_bytes },
|
||||
{ "warctempdir", &opt.warc_tempdir, cmd_directory },
|
||||
#ifdef USE_WATT32
|
||||
{ "wdebug", &opt.wdebug, cmd_boolean },
|
||||
#endif
|
||||
@ -362,6 +372,14 @@ defaults (void)
|
||||
|
||||
opt.useservertimestamps = true;
|
||||
opt.show_all_dns_entries = false;
|
||||
|
||||
opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */
|
||||
opt.warc_compression_enabled = true;
|
||||
opt.warc_digests_enabled = true;
|
||||
opt.warc_cdx_enabled = false;
|
||||
opt.warc_cdx_dedup_filename = NULL;
|
||||
opt.warc_tempdir = NULL;
|
||||
opt.warc_keep_log = true;
|
||||
}
|
||||
|
||||
/* Return the user's home directory (strdup-ed), or NULL if none is
|
||||
@ -1235,6 +1253,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
cmd_spec_warc_header (const char *com, const char *val, void *place_ignored)
|
||||
{
|
||||
/* Empty value means reset the list of headers. */
|
||||
if (*val == '\0')
|
||||
{
|
||||
free_vec (opt.warc_user_headers);
|
||||
opt.warc_user_headers = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!check_user_specified_header (val))
|
||||
{
|
||||
fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"),
|
||||
exec_name, com, quote (val));
|
||||
return false;
|
||||
}
|
||||
opt.warc_user_headers = vec_append (opt.warc_user_headers, val);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
cmd_spec_htmlify (const char *com, const char *val, void *place_ignored)
|
||||
{
|
||||
@ -1639,6 +1678,7 @@ cleanup (void)
|
||||
xfree_null (opt.http_user);
|
||||
xfree_null (opt.http_passwd);
|
||||
free_vec (opt.user_headers);
|
||||
free_vec (opt.warc_user_headers);
|
||||
# ifdef HAVE_SSL
|
||||
xfree_null (opt.cert_file);
|
||||
xfree_null (opt.private_key);
|
||||
|
60
src/log.c
60
src/log.c
@ -79,6 +79,10 @@ as that of the covered work. */
|
||||
logging is inhibited, logfp is set back to NULL. */
|
||||
static FILE *logfp;
|
||||
|
||||
/* A second file descriptor pointing to the temporary log file for the
|
||||
WARC writer. If WARC writing is disabled, this is NULL. */
|
||||
static FILE *warclogfp;
|
||||
|
||||
/* If true, it means logging is inhibited, i.e. nothing is printed or
|
||||
stored. */
|
||||
static bool inhibit_logging;
|
||||
@ -304,6 +308,31 @@ get_log_fp (void)
|
||||
return logfp;
|
||||
return stderr;
|
||||
}
|
||||
|
||||
/* Returns the file descriptor for the secondary log file. This is
|
||||
WARCLOGFP, except if called before log_init, in which case it
|
||||
returns stderr. This is useful in case someone calls a logging
|
||||
function before log_init.
|
||||
|
||||
If logging is inhibited, return NULL. */
|
||||
|
||||
static FILE *
|
||||
get_warc_log_fp (void)
|
||||
{
|
||||
if (inhibit_logging)
|
||||
return NULL;
|
||||
if (warclogfp)
|
||||
return warclogfp;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Sets the file descriptor for the secondary log file. */
|
||||
|
||||
void
|
||||
log_set_warc_log_fp (FILE * fp)
|
||||
{
|
||||
warclogfp = fp;
|
||||
}
|
||||
|
||||
/* Log a literal string S. The string is logged as-is, without a
|
||||
newline appended. */
|
||||
@ -312,13 +341,17 @@ void
|
||||
logputs (enum log_options o, const char *s)
|
||||
{
|
||||
FILE *fp;
|
||||
FILE *warcfp;
|
||||
|
||||
check_redirect_output ();
|
||||
if ((fp = get_log_fp ()) == NULL)
|
||||
return;
|
||||
warcfp = get_warc_log_fp ();
|
||||
CHECK_VERBOSE (o);
|
||||
|
||||
FPUTS (s, fp);
|
||||
if (warcfp != NULL)
|
||||
FPUTS (s, warcfp);
|
||||
if (save_context_p)
|
||||
saved_append (s);
|
||||
if (flush_log_p)
|
||||
@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
|
||||
int available_size = sizeof (smallmsg);
|
||||
int numwritten;
|
||||
FILE *fp = get_log_fp ();
|
||||
FILE *warcfp = get_warc_log_fp ();
|
||||
|
||||
if (!save_context_p)
|
||||
if (!save_context_p && warcfp == NULL)
|
||||
{
|
||||
/* In the simple case just call vfprintf(), to avoid needless
|
||||
allocation and games with vsnprintf(). */
|
||||
@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
|
||||
}
|
||||
|
||||
/* Writing succeeded. */
|
||||
saved_append (write_ptr);
|
||||
if (save_context_p)
|
||||
saved_append (write_ptr);
|
||||
FPUTS (write_ptr, fp);
|
||||
if (warcfp != NULL)
|
||||
FPUTS (write_ptr, warcfp);
|
||||
if (state->bigmsg)
|
||||
xfree (state->bigmsg);
|
||||
|
||||
@ -426,6 +463,7 @@ void
|
||||
logflush (void)
|
||||
{
|
||||
FILE *fp = get_log_fp ();
|
||||
FILE *warcfp = get_warc_log_fp ();
|
||||
if (fp)
|
||||
{
|
||||
/* 2005-10-25 SMS.
|
||||
@ -440,6 +478,10 @@ logflush (void)
|
||||
fflush (fp);
|
||||
#endif /* def __VMS [else] */
|
||||
}
|
||||
|
||||
if (warcfp != NULL)
|
||||
fflush (warcfp);
|
||||
|
||||
needs_flushing = false;
|
||||
}
|
||||
|
||||
@ -598,6 +640,7 @@ log_dump_context (void)
|
||||
{
|
||||
int num = log_line_current;
|
||||
FILE *fp = get_log_fp ();
|
||||
FILE *warcfp = get_warc_log_fp ();
|
||||
if (!fp)
|
||||
return;
|
||||
|
||||
@ -609,14 +652,23 @@ log_dump_context (void)
|
||||
{
|
||||
struct log_ln *ln = log_lines + num;
|
||||
if (ln->content)
|
||||
FPUTS (ln->content, fp);
|
||||
{
|
||||
FPUTS (ln->content, fp);
|
||||
if (warcfp != NULL)
|
||||
FPUTS (ln->content, warcfp);
|
||||
}
|
||||
ROT_ADVANCE (num);
|
||||
}
|
||||
while (num != log_line_current);
|
||||
if (trailing_line)
|
||||
if (log_lines[log_line_current].content)
|
||||
FPUTS (log_lines[log_line_current].content, fp);
|
||||
{
|
||||
FPUTS (log_lines[log_line_current].content, fp);
|
||||
if (warcfp != NULL)
|
||||
FPUTS (log_lines[log_line_current].content, warcfp);
|
||||
}
|
||||
fflush (fp);
|
||||
fflush (warcfp);
|
||||
}
|
||||
|
||||
/* String escape functions. */
|
||||
|
@ -34,8 +34,12 @@ as that of the covered work. */
|
||||
/* The log file to which Wget writes to after HUP. */
|
||||
#define DEFAULT_LOGFILE "wget-log"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS };
|
||||
|
||||
void log_set_warc_log_fp (FILE *);
|
||||
|
||||
void logprintf (enum log_options, const char *, ...)
|
||||
GCC_FORMAT_ATTR (2, 3);
|
||||
void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2);
|
||||
|
100
src/main.c
100
src/main.c
@ -55,6 +55,7 @@ as that of the covered work. */
|
||||
#include "spider.h"
|
||||
#include "http.h" /* for save_cookies */
|
||||
#include "ptimer.h"
|
||||
#include "warc.h"
|
||||
|
||||
#include <getopt.h>
|
||||
#include <getpass.h>
|
||||
@ -287,6 +288,15 @@ static struct cmdline_option option_data[] =
|
||||
{ "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument },
|
||||
{ "wait", 'w', OPT_VALUE, "wait", -1 },
|
||||
{ "waitretry", 0, OPT_VALUE, "waitretry", -1 },
|
||||
{ "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 },
|
||||
{ "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 },
|
||||
{ "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 },
|
||||
{ "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 },
|
||||
{ "warc-file", 0, OPT_VALUE, "warcfile", -1 },
|
||||
{ "warc-header", 0, OPT_VALUE, "warcheader", -1 },
|
||||
{ "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 },
|
||||
{ "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 },
|
||||
{ "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 },
|
||||
#ifdef USE_WATT32
|
||||
{ "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
|
||||
#endif
|
||||
@ -652,6 +662,29 @@ FTP options:\n"),
|
||||
--retr-symlinks when recursing, get linked-to files (not dir).\n"),
|
||||
"\n",
|
||||
|
||||
N_("\
|
||||
WARC options:\n"),
|
||||
N_("\
|
||||
--warc-file=FILENAME save request/response data to a .warc.gz file.\n"),
|
||||
N_("\
|
||||
--warc-header=STRING insert STRING into the warcinfo record.\n"),
|
||||
N_("\
|
||||
--warc-max-size=NUMBER set maximum size of WARC files to NUMBER.\n"),
|
||||
N_("\
|
||||
--warc-cdx write CDX index files.\n"),
|
||||
N_("\
|
||||
--warc-dedup=FILENAME do not store records listed in this CDX file.\n"),
|
||||
N_("\
|
||||
--no-warc-compression do not compress WARC files with GZIP.\n"),
|
||||
N_("\
|
||||
--no-warc-digests do not calculate SHA1 digests.\n"),
|
||||
N_("\
|
||||
--no-warc-keep-log do not store the log file in a WARC record.\n"),
|
||||
N_("\
|
||||
--warc-tempdir=DIRECTORY location for temporary files created by the\n\
|
||||
WARC writer.\n"),
|
||||
"\n",
|
||||
|
||||
N_("\
|
||||
Recursive download:\n"),
|
||||
N_("\
|
||||
@ -910,6 +943,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0)
|
||||
}
|
||||
|
||||
char *program_name; /* Needed by lib/error.c. */
|
||||
char *program_argstring; /* Needed by warc.c. */
|
||||
|
||||
int
|
||||
main (int argc, char **argv)
|
||||
@ -945,6 +979,22 @@ main (int argc, char **argv)
|
||||
windows_main ((char **) &exec_name);
|
||||
#endif
|
||||
|
||||
/* Construct the arguments string. */
|
||||
int argstring_length = 1;
|
||||
for (i = 1; i < argc; i++)
|
||||
argstring_length += strlen (argv[i]) + 2 + 1;
|
||||
char *p = program_argstring = malloc (argstring_length * sizeof (char));
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
*p++ = '"';
|
||||
int arglen = strlen (argv[i]);
|
||||
memcpy (p, argv[i], arglen);
|
||||
p += arglen;
|
||||
*p++ = '"';
|
||||
*p++ = ' ';
|
||||
}
|
||||
*p = '\0';
|
||||
|
||||
/* Load the hard-coded defaults. */
|
||||
defaults ();
|
||||
|
||||
@ -1194,6 +1244,47 @@ for details.\n\n"));
|
||||
}
|
||||
}
|
||||
|
||||
if (opt.warc_filename != 0)
|
||||
{
|
||||
if (opt.noclobber)
|
||||
{
|
||||
fprintf (stderr,
|
||||
_("WARC output does not work with --no-clobber, "
|
||||
"--no-clobber will be disabled.\n"));
|
||||
opt.noclobber = false;
|
||||
}
|
||||
if (opt.timestamping)
|
||||
{
|
||||
fprintf (stderr,
|
||||
_("WARC output does not work with timestamping, "
|
||||
"timestamping will be disabled.\n"));
|
||||
opt.timestamping = false;
|
||||
}
|
||||
if (opt.spider)
|
||||
{
|
||||
fprintf (stderr,
|
||||
_("WARC output does not work with --spider.\n"));
|
||||
exit (1);
|
||||
}
|
||||
if (opt.always_rest)
|
||||
{
|
||||
fprintf (stderr,
|
||||
_("WARC output does not work with --continue, "
|
||||
"--continue will be disabled.\n"));
|
||||
opt.always_rest = false;
|
||||
}
|
||||
if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled)
|
||||
{
|
||||
fprintf (stderr,
|
||||
_("Digests are disabled; WARC deduplication will "
|
||||
"not find duplicate records.\n"));
|
||||
}
|
||||
if (opt.warc_keep_log)
|
||||
{
|
||||
opt.progress_type = "dot";
|
||||
}
|
||||
}
|
||||
|
||||
if (opt.ask_passwd && opt.passwd)
|
||||
{
|
||||
fprintf (stderr,
|
||||
@ -1273,6 +1364,10 @@ for details.\n\n"));
|
||||
/* Initialize logging. */
|
||||
log_init (opt.lfilename, append_to_log);
|
||||
|
||||
/* Open WARC file. */
|
||||
if (opt.warc_filename != 0)
|
||||
warc_init ();
|
||||
|
||||
DEBUGP (("DEBUG output created by Wget %s on %s.\n\n",
|
||||
version_string, OS_TYPE));
|
||||
|
||||
@ -1472,7 +1567,12 @@ outputting to a regular file.\n"));
|
||||
if (opt.convert_links && !opt.delete_after)
|
||||
convert_all_links ();
|
||||
|
||||
/* Close WARC file. */
|
||||
if (opt.warc_filename != 0)
|
||||
warc_close ();
|
||||
|
||||
log_close ();
|
||||
|
||||
for (i = 0; i < nurl; i++)
|
||||
xfree (url[i]);
|
||||
cleanup ();
|
||||
|
@ -87,6 +87,15 @@ struct options
|
||||
FTP. */
|
||||
char *output_document; /* The output file to which the
|
||||
documents will be printed. */
|
||||
char *warc_filename; /* WARC output filename */
|
||||
char *warc_tempdir; /* WARC temp dir */
|
||||
char *warc_cdx_dedup_filename; /* CDX file to be used for deduplication. */
|
||||
wgint warc_maxsize; /* WARC max archive size */
|
||||
bool warc_compression_enabled; /* For GZIP compression. */
|
||||
bool warc_digests_enabled; /* For SHA1 digests. */
|
||||
bool warc_cdx_enabled; /* Create CDX files? */
|
||||
bool warc_keep_log; /* Store the log file in a WARC record. */
|
||||
char **warc_user_headers; /* User-defined WARC header(s). */
|
||||
|
||||
char *user; /* Generic username */
|
||||
char *passwd; /* Generic password */
|
||||
|
41
src/retr.c
41
src/retr.c
@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer)
|
||||
|
||||
/* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
|
||||
amount of data and decrease SKIP. Increment *TOTAL by the amount
|
||||
of data written. */
|
||||
of data written. If OUT2 is not NULL, also write BUF to OUT2.
|
||||
In case of error writing to OUT, -1 is returned. In case of error
|
||||
writing to OUT2, -2 is returned. In case of any other error,
|
||||
1 is returned. */
|
||||
|
||||
static int
|
||||
write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
||||
wgint *written)
|
||||
write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
|
||||
wgint *skip, wgint *written)
|
||||
{
|
||||
if (!out)
|
||||
if (out == NULL && out2 == NULL)
|
||||
return 1;
|
||||
if (*skip > bufsize)
|
||||
{
|
||||
@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
||||
return 1;
|
||||
}
|
||||
|
||||
fwrite (buf, 1, bufsize, out);
|
||||
if (out != NULL)
|
||||
fwrite (buf, 1, bufsize, out);
|
||||
if (out2 != NULL)
|
||||
fwrite (buf, 1, bufsize, out2);
|
||||
*written += bufsize;
|
||||
|
||||
/* Immediately flush the downloaded data. This should not hinder
|
||||
@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
||||
actual justification. (Also, why 16K? Anyone test other values?)
|
||||
*/
|
||||
#ifndef __VMS
|
||||
fflush (out);
|
||||
if (out != NULL)
|
||||
fflush (out);
|
||||
if (out2 != NULL)
|
||||
fflush (out2);
|
||||
#endif /* ndef __VMS */
|
||||
return !ferror (out);
|
||||
if (out != NULL && ferror (out))
|
||||
return -1;
|
||||
else if (out2 != NULL && ferror (out2))
|
||||
return -2;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Read the contents of file descriptor FD until the connection
|
||||
@ -198,13 +212,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
||||
the amount of data written to disk. The time it took to download
|
||||
the data is stored to ELAPSED.
|
||||
|
||||
If OUT2 is non-NULL, the contents are also written to OUT2.
|
||||
|
||||
The function exits and returns the amount of data read. In case of
|
||||
error while reading data, -1 is returned. In case of error while
|
||||
writing data, -2 is returned. */
|
||||
writing data to OUT, -2 is returned. In case of error while writing
|
||||
data to OUT2, -3 is returned. */
|
||||
|
||||
int
|
||||
fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
|
||||
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
|
||||
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
|
||||
FILE *out2)
|
||||
{
|
||||
int ret = 0;
|
||||
#undef max
|
||||
@ -343,9 +361,10 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
|
||||
if (ret > 0)
|
||||
{
|
||||
sum_read += ret;
|
||||
if (!write_data (out, dlbuf, ret, &skip, &sum_written))
|
||||
int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
|
||||
if (write_res != 0)
|
||||
{
|
||||
ret = -2;
|
||||
ret = (write_res == -3) ? -3 : -2;
|
||||
goto out;
|
||||
}
|
||||
if (chunked)
|
||||
|
@ -50,7 +50,7 @@ enum {
|
||||
rb_chunked_transfer_encoding = 4
|
||||
};
|
||||
|
||||
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int);
|
||||
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *);
|
||||
|
||||
typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
|
||||
|
||||
|
@ -46,6 +46,8 @@ const char *test_append_uri_pathel();
|
||||
const char *test_are_urls_equal();
|
||||
const char *test_is_robots_txt_url();
|
||||
|
||||
const char *program_argstring = "TEST";
|
||||
|
||||
int tests_run;
|
||||
|
||||
static const char *
|
||||
|
1332
src/warc.c
Normal file
1332
src/warc.c
Normal file
File diff suppressed because it is too large
Load Diff
19
src/warc.h
Normal file
19
src/warc.h
Normal file
@ -0,0 +1,19 @@
|
||||
/* Declarations of WARC helper methods. */
#ifndef WARC_H
#define WARC_H

#include <stdio.h>              /* for FILE */

#include "host.h"               /* for ip_address */

/* Functions that take no arguments are declared with (void) so that
   these are real prototypes; with empty parentheses the compiler would
   not check the arguments at call sites.  */

/* Start up and shut down the WARC writer.  main () calls warc_init
   right after log_init and warc_close right before log_close, and only
   when a WARC file name has been given.  */
void warc_init (void);
void warc_close (void);

/* NOTE(review): both presumably write their result into the
   caller-supplied buffer; the required buffer sizes are not visible
   from this header -- confirm against warc.c.  */
void warc_timestamp (char *timestamp);
void warc_uuid_str (char *id_str);

/* NOTE(review): presumably returns a freshly opened temporary file for
   use by the WARC writer; failure behaviour is not visible from this
   header -- confirm against warc.c.  */
FILE * warc_tempfile (void);

/* Record writers.  NOTE(review): the bool result presumably reports
   whether the record was written successfully, and BODY/PAYLOAD_OFFSET
   presumably describe the data to store -- confirm against warc.c.  */
bool warc_write_request_record (char *url, char *timestamp_str,
                                char *concurrent_to_uuid, ip_address *ip,
                                FILE *body, long int payload_offset);
bool warc_write_response_record (char *url, char *timestamp_str,
                                 char *concurrent_to_uuid, ip_address *ip,
                                 FILE *body, long int payload_offset,
                                 char *mime_type, int response_code,
                                 char *redirect_location);
bool warc_write_resource_record (char *resource_uuid, char *url,
                                 char *timestamp_str,
                                 char *concurrent_to_uuid, ip_address *ip,
                                 char *content_type, FILE *body,
                                 long int payload_offset);

#endif /* WARC_H */
|
||||
|
@ -353,7 +353,9 @@ typedef enum
|
||||
PROXERR,
|
||||
/* 50 */
|
||||
AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR,
|
||||
UNLINKERR, NEWLOCATION_KEEP_POST
|
||||
UNLINKERR, NEWLOCATION_KEEP_POST,
|
||||
|
||||
WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR
|
||||
} uerr_t;
|
||||
|
||||
/* 2005-02-19 SMS.
|
||||
|
Loading…
Reference in New Issue
Block a user