mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Add support for WARC files.
This commit is contained in:
parent
a5fdba0958
commit
e3820953b2
@ -28,6 +28,7 @@ gnulib_modules="
|
|||||||
accept
|
accept
|
||||||
alloca
|
alloca
|
||||||
announce-gen
|
announce-gen
|
||||||
|
base32
|
||||||
bind
|
bind
|
||||||
c-ctype
|
c-ctype
|
||||||
clock-time
|
clock-time
|
||||||
@ -49,6 +50,7 @@ maintainer-makefile
|
|||||||
mbtowc
|
mbtowc
|
||||||
mkdir
|
mkdir
|
||||||
crypto/md5
|
crypto/md5
|
||||||
|
crypto/sha1
|
||||||
pipe
|
pipe
|
||||||
quote
|
quote
|
||||||
quotearg
|
quotearg
|
||||||
@ -63,6 +65,7 @@ socket
|
|||||||
stdbool
|
stdbool
|
||||||
strcasestr
|
strcasestr
|
||||||
strerror_r-posix
|
strerror_r-posix
|
||||||
|
tmpdir
|
||||||
unlocked-io
|
unlocked-io
|
||||||
update-copyright
|
update-copyright
|
||||||
vasprintf
|
vasprintf
|
||||||
|
12
configure.ac
12
configure.ac
@ -511,6 +511,18 @@ if test "X$iri" != "Xno"; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
dnl
|
||||||
|
dnl Check for UUID
|
||||||
|
dnl
|
||||||
|
|
||||||
|
AC_CHECK_HEADER(uuid/uuid.h,
|
||||||
|
AC_CHECK_LIB(uuid, uuid_generate,
|
||||||
|
[LIBS="${LIBS} -luuid"
|
||||||
|
AC_DEFINE([HAVE_LIBUUID], 1,
|
||||||
|
[Define if libuuid is available.])
|
||||||
|
])
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
dnl Needed by src/Makefile.am
|
dnl Needed by src/Makefile.am
|
||||||
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
2011-11-04 Giuseppe Scrivano <gscrivano@gnu.org>
|
||||||
|
|
||||||
|
|
||||||
2011-10-07 Steven Schweda <address@hidden>
|
2011-10-07 Steven Schweda <address@hidden>
|
||||||
|
|
||||||
* connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions
|
* connect.c: Add HAVE_SYS_SELECT_H and HAVE_SYS_SOCKET_H conditions
|
||||||
@ -21,7 +24,10 @@
|
|||||||
* openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const"
|
* openssl.c (ssl_init): Add type cast (SSL_METHOD *) to newly "const"
|
||||||
"meth" argument to accommodate OpenSSL version 0.9.8, where that
|
"meth" argument to accommodate OpenSSL version 0.9.8, where that
|
||||||
argument is not "const" in the OpenSSL function (SSL_CTX_new).
|
argument is not "const" in the OpenSSL function (SSL_CTX_new).
|
||||||
|
* test.c: Declare "program_argstring".
|
||||||
* utils.c (fopen_excl): Comment typography.
|
* utils.c (fopen_excl): Comment typography.
|
||||||
|
* warc.h: New file.
|
||||||
|
* warc.c: New file.
|
||||||
|
|
||||||
2011-10-02 Henrik Holst <henrik.holst@millistream.com> (tiny change)
|
2011-10-02 Henrik Holst <henrik.holst@millistream.com> (tiny change)
|
||||||
* http.c (gethttp): If 'contentonerror' is used then do not
|
* http.c (gethttp): If 'contentonerror' is used then do not
|
||||||
|
@ -46,13 +46,13 @@ wget_SOURCES = cmpt.c connect.c convert.c cookies.c ftp.c \
|
|||||||
css_.c css-url.c \
|
css_.c css-url.c \
|
||||||
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
|
ftp-basic.c ftp-ls.c hash.c host.c html-parse.c html-url.c \
|
||||||
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
|
http.c init.c log.c main.c netrc.c progress.c ptimer.c \
|
||||||
recur.c res.c retr.c spider.c url.c \
|
recur.c res.c retr.c spider.c url.c warc.c \
|
||||||
utils.c exits.c build_info.c $(IRI_OBJ) \
|
utils.c exits.c build_info.c $(IRI_OBJ) \
|
||||||
css-url.h css-tokens.h connect.h convert.h cookies.h \
|
css-url.h css-tokens.h connect.h convert.h cookies.h \
|
||||||
ftp.h hash.h host.h html-parse.h html-url.h \
|
ftp.h hash.h host.h html-parse.h html-url.h \
|
||||||
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
|
http.h http-ntlm.h init.h log.h mswindows.h netrc.h \
|
||||||
options.h progress.h ptimer.h recur.h res.h retr.h \
|
options.h progress.h ptimer.h recur.h res.h retr.h \
|
||||||
spider.h ssl.h sysdep.h url.h utils.h wget.h iri.h \
|
spider.h ssl.h sysdep.h url.h warc.h utils.h wget.h iri.h \
|
||||||
exits.h gettext.h
|
exits.h gettext.h
|
||||||
nodist_wget_SOURCES = version.c
|
nodist_wget_SOURCES = version.c
|
||||||
EXTRA_wget_SOURCES = iri.c
|
EXTRA_wget_SOURCES = iri.c
|
||||||
|
61
src/ftp.c
61
src/ftp.c
@ -49,6 +49,7 @@ as that of the covered work. */
|
|||||||
#include "netrc.h"
|
#include "netrc.h"
|
||||||
#include "convert.h" /* for downloaded_file */
|
#include "convert.h" /* for downloaded_file */
|
||||||
#include "recur.h" /* for INFINITE_RECURSION */
|
#include "recur.h" /* for INFINITE_RECURSION */
|
||||||
|
#include "warc.h"
|
||||||
|
|
||||||
#ifdef __VMS
|
#ifdef __VMS
|
||||||
# include "vms.h"
|
# include "vms.h"
|
||||||
@ -237,10 +238,11 @@ static uerr_t ftp_get_listing (struct url *, ccon *, struct fileinfo **);
|
|||||||
|
|
||||||
/* Retrieves a file with denoted parameters through opening an FTP
|
/* Retrieves a file with denoted parameters through opening an FTP
|
||||||
connection to the server. It always closes the data connection,
|
connection to the server. It always closes the data connection,
|
||||||
and closes the control connection in case of error. */
|
and closes the control connection in case of error. If warc_tmp
|
||||||
|
is non-NULL, the downloaded data will be written there as well. */
|
||||||
static uerr_t
|
static uerr_t
|
||||||
getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread,
|
getftp (struct url *u, wgint passed_expected_bytes, wgint *qtyread,
|
||||||
wgint restval, ccon *con, int count)
|
wgint restval, ccon *con, int count, FILE *warc_tmp)
|
||||||
{
|
{
|
||||||
int csock, dtsock, local_sock, res;
|
int csock, dtsock, local_sock, res;
|
||||||
uerr_t err = RETROK; /* appease the compiler */
|
uerr_t err = RETROK; /* appease the compiler */
|
||||||
@ -1274,7 +1276,7 @@ Error in server response, closing control connection.\n"));
|
|||||||
rd_size = 0;
|
rd_size = 0;
|
||||||
res = fd_read_body (dtsock, fp,
|
res = fd_read_body (dtsock, fp,
|
||||||
expected_bytes ? expected_bytes - restval : 0,
|
expected_bytes ? expected_bytes - restval : 0,
|
||||||
restval, &rd_size, qtyread, &con->dltime, flags);
|
restval, &rd_size, qtyread, &con->dltime, flags, warc_tmp);
|
||||||
|
|
||||||
tms = datetime_str (time (NULL));
|
tms = datetime_str (time (NULL));
|
||||||
tmrate = retr_rate (rd_size, con->dltime);
|
tmrate = retr_rate (rd_size, con->dltime);
|
||||||
@ -1285,15 +1287,18 @@ Error in server response, closing control connection.\n"));
|
|||||||
if (!output_stream || con->cmd & DO_LIST)
|
if (!output_stream || con->cmd & DO_LIST)
|
||||||
fclose (fp);
|
fclose (fp);
|
||||||
|
|
||||||
/* If fd_read_body couldn't write to fp, bail out. */
|
/* If fd_read_body couldn't write to fp or warc_tmp, bail out. */
|
||||||
if (res == -2)
|
if (res == -2 || (warc_tmp != NULL && res == -3))
|
||||||
{
|
{
|
||||||
logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"),
|
logprintf (LOG_NOTQUIET, _("%s: %s, closing control connection.\n"),
|
||||||
con->target, strerror (errno));
|
con->target, strerror (errno));
|
||||||
fd_close (csock);
|
fd_close (csock);
|
||||||
con->csock = -1;
|
con->csock = -1;
|
||||||
fd_close (dtsock);
|
fd_close (dtsock);
|
||||||
|
if (res == -2)
|
||||||
return FWRITEERR;
|
return FWRITEERR;
|
||||||
|
else if (res == -3)
|
||||||
|
return WARC_TMP_FWRITEERR;
|
||||||
}
|
}
|
||||||
else if (res == -1)
|
else if (res == -1)
|
||||||
{
|
{
|
||||||
@ -1409,6 +1414,11 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
|||||||
uerr_t err;
|
uerr_t err;
|
||||||
struct_stat st;
|
struct_stat st;
|
||||||
|
|
||||||
|
/* Declare WARC variables. */
|
||||||
|
bool warc_enabled = (opt.warc_filename != NULL);
|
||||||
|
FILE *warc_tmp = NULL;
|
||||||
|
ip_address *warc_ip = NULL;
|
||||||
|
|
||||||
/* Get the target, and set the name for the message accordingly. */
|
/* Get the target, and set the name for the message accordingly. */
|
||||||
if ((f == NULL) && (con->target))
|
if ((f == NULL) && (con->target))
|
||||||
{
|
{
|
||||||
@ -1445,6 +1455,21 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
|||||||
|
|
||||||
orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0;
|
orig_lp = con->cmd & LEAVE_PENDING ? 1 : 0;
|
||||||
|
|
||||||
|
/* For file RETR requests, we can write a WARC record.
|
||||||
|
We record the file contents to a temporary file. */
|
||||||
|
if (warc_enabled && (con->cmd & DO_RETR))
|
||||||
|
{
|
||||||
|
warc_tmp = warc_tempfile ();
|
||||||
|
if (warc_tmp == NULL)
|
||||||
|
return WARC_TMP_FOPENERR;
|
||||||
|
|
||||||
|
if (!con->proxy && con->csock != -1)
|
||||||
|
{
|
||||||
|
warc_ip = (ip_address *) alloca (sizeof (ip_address));
|
||||||
|
socket_ip_address (con->csock, warc_ip, ENDPOINT_PEER);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* THE loop. */
|
/* THE loop. */
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@ -1509,7 +1534,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
|||||||
len = f->size;
|
len = f->size;
|
||||||
else
|
else
|
||||||
len = 0;
|
len = 0;
|
||||||
err = getftp (u, len, &qtyread, restval, con, count);
|
|
||||||
|
/* If we are working on a WARC record, getftp should also write
|
||||||
|
to the warc_tmp file. */
|
||||||
|
err = getftp (u, len, &qtyread, restval, con, count, warc_tmp);
|
||||||
|
|
||||||
if (con->csock == -1)
|
if (con->csock == -1)
|
||||||
con->st &= ~DONE_CWD;
|
con->st &= ~DONE_CWD;
|
||||||
@ -1520,8 +1548,10 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
|||||||
{
|
{
|
||||||
case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR:
|
case HOSTERR: case CONIMPOSSIBLE: case FWRITEERR: case FOPENERR:
|
||||||
case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED:
|
case FTPNSFOD: case FTPLOGINC: case FTPNOPASV: case CONTNOTSUPPORTED:
|
||||||
case UNLINKERR:
|
case UNLINKERR: case WARC_TMP_FWRITEERR:
|
||||||
/* Fatal errors, give up. */
|
/* Fatal errors, give up. */
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
fclose (warc_tmp);
|
||||||
return err;
|
return err;
|
||||||
case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR:
|
case CONSOCKERR: case CONERROR: case FTPSRVERR: case FTPRERR:
|
||||||
case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR:
|
case WRITEFAILED: case FTPUNKNOWNTYPE: case FTPSYSERR:
|
||||||
@ -1589,6 +1619,19 @@ ftp_loop_internal (struct url *u, struct fileinfo *f, ccon *con, char **local_fi
|
|||||||
xfree (hurl);
|
xfree (hurl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (warc_enabled && (con->cmd & DO_RETR))
|
||||||
|
{
|
||||||
|
/* Create and store a WARC resource record for the retrieved file. */
|
||||||
|
bool warc_res;
|
||||||
|
|
||||||
|
warc_res = warc_write_resource_record (NULL, u->url, NULL, NULL,
|
||||||
|
warc_ip, NULL, warc_tmp, -1);
|
||||||
|
if (! warc_res)
|
||||||
|
return WARC_ERR;
|
||||||
|
|
||||||
|
/* warc_write_resource_record has also closed warc_tmp. */
|
||||||
|
}
|
||||||
|
|
||||||
if ((con->cmd & DO_LIST))
|
if ((con->cmd & DO_LIST))
|
||||||
/* This is a directory listing file. */
|
/* This is a directory listing file. */
|
||||||
{
|
{
|
||||||
@ -1928,7 +1971,9 @@ Already have correct symlink %s -> %s\n\n"),
|
|||||||
xfree (ofile);
|
xfree (ofile);
|
||||||
|
|
||||||
/* Break on fatals. */
|
/* Break on fatals. */
|
||||||
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR)
|
if (err == QUOTEXC || err == HOSTERR || err == FWRITEERR
|
||||||
|
|| err == WARC_ERR || err == WARC_TMP_FOPENERR
|
||||||
|
|| err == WARC_TMP_FWRITEERR)
|
||||||
break;
|
break;
|
||||||
con->cmd &= ~ (DO_CWD | DO_LOGIN);
|
con->cmd &= ~ (DO_CWD | DO_LOGIN);
|
||||||
f = f->next;
|
f = f->next;
|
||||||
|
392
src/http.c
392
src/http.c
@ -58,6 +58,7 @@ as that of the covered work. */
|
|||||||
#include "md5.h"
|
#include "md5.h"
|
||||||
#include "convert.h"
|
#include "convert.h"
|
||||||
#include "spider.h"
|
#include "spider.h"
|
||||||
|
#include "warc.h"
|
||||||
|
|
||||||
#ifdef TESTING
|
#ifdef TESTING
|
||||||
#include "test.h"
|
#include "test.h"
|
||||||
@ -320,10 +321,12 @@ request_remove_header (struct request *req, char *name)
|
|||||||
p += A_len; \
|
p += A_len; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* Construct the request and write it to FD using fd_write. */
|
/* Construct the request and write it to FD using fd_write.
|
||||||
|
If warc_tmp is set to a file pointer, the request string will
|
||||||
|
also be written to that file. */
|
||||||
|
|
||||||
static int
|
static int
|
||||||
request_send (const struct request *req, int fd)
|
request_send (const struct request *req, int fd, FILE *warc_tmp)
|
||||||
{
|
{
|
||||||
char *request_string, *p;
|
char *request_string, *p;
|
||||||
int i, size, write_error;
|
int i, size, write_error;
|
||||||
@ -374,6 +377,13 @@ request_send (const struct request *req, int fd)
|
|||||||
if (write_error < 0)
|
if (write_error < 0)
|
||||||
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
|
logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
|
||||||
fd_errstr (fd));
|
fd_errstr (fd));
|
||||||
|
else if (warc_tmp != NULL)
|
||||||
|
{
|
||||||
|
/* Write a copy of the data to the WARC record. */
|
||||||
|
int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
|
||||||
|
if (warc_tmp_written != size - 1)
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
return write_error;
|
return write_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -444,10 +454,12 @@ register_basic_auth_host (const char *hostname)
|
|||||||
|
|
||||||
/* Send the contents of FILE_NAME to SOCK. Make sure that exactly
|
/* Send the contents of FILE_NAME to SOCK. Make sure that exactly
|
||||||
PROMISED_SIZE bytes are sent over the wire -- if the file is
|
PROMISED_SIZE bytes are sent over the wire -- if the file is
|
||||||
longer, read only that much; if the file is shorter, report an error. */
|
longer, read only that much; if the file is shorter, report an error.
|
||||||
|
If warc_tmp is set to a file pointer, the post data will
|
||||||
|
also be written to that file. */
|
||||||
|
|
||||||
static int
|
static int
|
||||||
post_file (int sock, const char *file_name, wgint promised_size)
|
post_file (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
|
||||||
{
|
{
|
||||||
static char chunk[8192];
|
static char chunk[8192];
|
||||||
wgint written = 0;
|
wgint written = 0;
|
||||||
@ -472,6 +484,16 @@ post_file (int sock, const char *file_name, wgint promised_size)
|
|||||||
fclose (fp);
|
fclose (fp);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
{
|
||||||
|
/* Write a copy of the data to the WARC record. */
|
||||||
|
int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
|
||||||
|
if (warc_tmp_written != towrite)
|
||||||
|
{
|
||||||
|
fclose (fp);
|
||||||
|
return -2;
|
||||||
|
}
|
||||||
|
}
|
||||||
written += towrite;
|
written += towrite;
|
||||||
}
|
}
|
||||||
fclose (fp);
|
fclose (fp);
|
||||||
@ -1462,6 +1484,135 @@ File %s already there; not retrieving.\n\n"), quote (filename));
|
|||||||
*dt |= TEXTHTML;
|
*dt |= TEXTHTML;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Downloads the response body from the socket and writes it to
|
||||||
|
an output file. The headers have already been read from the
|
||||||
|
socket. If WARC is enabled, the response body will also be
|
||||||
|
written to a WARC response record.
|
||||||
|
|
||||||
|
hs, contlen, contrange, chunked_transfer_encoding and url are
|
||||||
|
parameters from the gethttp method. fp is a pointer to the
|
||||||
|
output file.
|
||||||
|
|
||||||
|
url, warc_timestamp_str, warc_request_uuid, warc_ip, type
|
||||||
|
and statcode will be saved in the headers of the WARC record.
|
||||||
|
The head parameter contains the HTTP headers of the response.
|
||||||
|
|
||||||
|
If fp is NULL and WARC is enabled, the response body will be
|
||||||
|
written only to the WARC file. If WARC is disabled and fp
|
||||||
|
is a file pointer, the data will be written to the file.
|
||||||
|
If fp is a file pointer and WARC is enabled, the body will
|
||||||
|
be written to both destinations.
|
||||||
|
|
||||||
|
Returns the error code. */
|
||||||
|
static int
|
||||||
|
read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
|
||||||
|
wgint contrange, bool chunked_transfer_encoding,
|
||||||
|
char *url, char *warc_timestamp_str, char *warc_request_uuid,
|
||||||
|
ip_address *warc_ip, char *type, int statcode, char *head)
|
||||||
|
{
|
||||||
|
int warc_payload_offset = 0;
|
||||||
|
FILE *warc_tmp = NULL;
|
||||||
|
int warcerr = 0;
|
||||||
|
|
||||||
|
if (opt.warc_filename != NULL)
|
||||||
|
{
|
||||||
|
/* Open a temporary file where we can write the response before we
|
||||||
|
add it to the WARC record. */
|
||||||
|
warc_tmp = warc_tempfile ();
|
||||||
|
if (warc_tmp == NULL)
|
||||||
|
warcerr = WARC_TMP_FOPENERR;
|
||||||
|
|
||||||
|
if (warcerr == 0)
|
||||||
|
{
|
||||||
|
/* We should keep the response headers for the WARC record. */
|
||||||
|
int head_len = strlen (head);
|
||||||
|
int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp);
|
||||||
|
if (warc_tmp_written != head_len)
|
||||||
|
warcerr = WARC_TMP_FWRITEERR;
|
||||||
|
warc_payload_offset = head_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (warcerr != 0)
|
||||||
|
{
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
fclose (warc_tmp);
|
||||||
|
return warcerr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fp != NULL)
|
||||||
|
{
|
||||||
|
/* This confuses the timestamping code that checks for file size.
|
||||||
|
#### The timestamping code should be smarter about file size. */
|
||||||
|
if (opt.save_headers && hs->restval == 0)
|
||||||
|
fwrite (head, 1, strlen (head), fp);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read the response body. */
|
||||||
|
int flags = 0;
|
||||||
|
if (contlen != -1)
|
||||||
|
/* If content-length is present, read that much; otherwise, read
|
||||||
|
until EOF. The HTTP spec doesn't require the server to
|
||||||
|
actually close the connection when it's done sending data. */
|
||||||
|
flags |= rb_read_exactly;
|
||||||
|
if (fp != NULL && hs->restval > 0 && contrange == 0)
|
||||||
|
/* If the server ignored our range request, instruct fd_read_body
|
||||||
|
to skip the first RESTVAL bytes of body. */
|
||||||
|
flags |= rb_skip_startpos;
|
||||||
|
if (chunked_transfer_encoding)
|
||||||
|
flags |= rb_chunked_transfer_encoding;
|
||||||
|
|
||||||
|
hs->len = hs->restval;
|
||||||
|
hs->rd_size = 0;
|
||||||
|
/* Download the response body and write it to fp.
|
||||||
|
If we are working on a WARC file, we simultaneously write the
|
||||||
|
response body to warc_tmp. */
|
||||||
|
hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
|
||||||
|
hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
|
||||||
|
flags, warc_tmp);
|
||||||
|
if (hs->res >= 0)
|
||||||
|
{
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
{
|
||||||
|
/* Create a response record and write it to the WARC file.
|
||||||
|
Note: per the WARC standard, the request and response should share
|
||||||
|
the same date header. We re-use the timestamp of the request.
|
||||||
|
The response record should also refer to the uuid of the request. */
|
||||||
|
bool r = warc_write_response_record (url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip,
|
||||||
|
warc_tmp, warc_payload_offset,
|
||||||
|
type, statcode, hs->newloc);
|
||||||
|
|
||||||
|
/* warc_write_response_record has closed warc_tmp. */
|
||||||
|
|
||||||
|
if (! r)
|
||||||
|
return WARC_ERR;
|
||||||
|
}
|
||||||
|
|
||||||
|
return RETRFINISHED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
fclose (warc_tmp);
|
||||||
|
|
||||||
|
if (hs->res == -2)
|
||||||
|
{
|
||||||
|
/* Error while writing to fp (the output file). */
|
||||||
|
return FWRITEERR;
|
||||||
|
}
|
||||||
|
else if (hs->res == -3)
|
||||||
|
{
|
||||||
|
/* Error while writing to warc_tmp. */
|
||||||
|
return WARC_TMP_FWRITEERR;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* A read error! */
|
||||||
|
hs->rderrmsg = xstrdup (fd_errstr (sock));
|
||||||
|
return RETRFINISHED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#define BEGINS_WITH(line, string_constant) \
|
#define BEGINS_WITH(line, string_constant) \
|
||||||
(!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
|
(!strncasecmp (line, string_constant, sizeof (string_constant) - 1) \
|
||||||
&& (c_isspace (line[sizeof (string_constant) - 1]) \
|
&& (c_isspace (line[sizeof (string_constant) - 1]) \
|
||||||
@ -1519,9 +1670,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
wgint contlen, contrange;
|
wgint contlen, contrange;
|
||||||
struct url *conn;
|
struct url *conn;
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
int err;
|
||||||
|
|
||||||
int sock = -1;
|
int sock = -1;
|
||||||
int flags;
|
|
||||||
|
|
||||||
/* Set to 1 when the authorization has already been sent and should
|
/* Set to 1 when the authorization has already been sent and should
|
||||||
not be tried again. */
|
not be tried again. */
|
||||||
@ -1547,6 +1698,14 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
char hdrval[256];
|
char hdrval[256];
|
||||||
char *message;
|
char *message;
|
||||||
|
|
||||||
|
/* Declare WARC variables. */
|
||||||
|
bool warc_enabled = (opt.warc_filename != NULL);
|
||||||
|
FILE *warc_tmp = NULL;
|
||||||
|
char warc_timestamp_str [21];
|
||||||
|
char warc_request_uuid [48];
|
||||||
|
ip_address *warc_ip = NULL;
|
||||||
|
long int warc_payload_offset = -1;
|
||||||
|
|
||||||
/* Whether this connection will be kept alive after the HTTP request
|
/* Whether this connection will be kept alive after the HTTP request
|
||||||
is done. */
|
is done. */
|
||||||
bool keep_alive;
|
bool keep_alive;
|
||||||
@ -1852,7 +2011,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
that the contents of Host would be exactly the same as
|
that the contents of Host would be exactly the same as
|
||||||
the contents of CONNECT. */
|
the contents of CONNECT. */
|
||||||
|
|
||||||
write_error = request_send (connreq, sock);
|
write_error = request_send (connreq, sock, 0);
|
||||||
request_free (connreq);
|
request_free (connreq);
|
||||||
if (write_error < 0)
|
if (write_error < 0)
|
||||||
{
|
{
|
||||||
@ -1924,8 +2083,26 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
#endif /* HAVE_SSL */
|
#endif /* HAVE_SSL */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Open the temporary file where we will write the request. */
|
||||||
|
if (warc_enabled)
|
||||||
|
{
|
||||||
|
warc_tmp = warc_tempfile ();
|
||||||
|
if (warc_tmp == NULL)
|
||||||
|
{
|
||||||
|
CLOSE_INVALIDATE (sock);
|
||||||
|
request_free (req);
|
||||||
|
return WARC_TMP_FOPENERR;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (! proxy)
|
||||||
|
{
|
||||||
|
warc_ip = (ip_address *) alloca (sizeof (ip_address));
|
||||||
|
socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Send the request to server. */
|
/* Send the request to server. */
|
||||||
write_error = request_send (req, sock);
|
write_error = request_send (req, sock, warc_tmp);
|
||||||
|
|
||||||
if (write_error >= 0)
|
if (write_error >= 0)
|
||||||
{
|
{
|
||||||
@ -1933,15 +2110,38 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
{
|
{
|
||||||
DEBUGP (("[POST data: %s]\n", opt.post_data));
|
DEBUGP (("[POST data: %s]\n", opt.post_data));
|
||||||
write_error = fd_write (sock, opt.post_data, post_data_size, -1);
|
write_error = fd_write (sock, opt.post_data, post_data_size, -1);
|
||||||
|
if (write_error >= 0 && warc_tmp != NULL)
|
||||||
|
{
|
||||||
|
/* Remember end of headers / start of payload. */
|
||||||
|
warc_payload_offset = ftell (warc_tmp);
|
||||||
|
|
||||||
|
/* Write a copy of the data to the WARC record. */
|
||||||
|
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
|
||||||
|
if (warc_tmp_written != post_data_size)
|
||||||
|
write_error = -2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (opt.post_file_name && post_data_size != 0)
|
else if (opt.post_file_name && post_data_size != 0)
|
||||||
write_error = post_file (sock, opt.post_file_name, post_data_size);
|
{
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
/* Remember end of headers / start of payload. */
|
||||||
|
warc_payload_offset = ftell (warc_tmp);
|
||||||
|
|
||||||
|
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (write_error < 0)
|
if (write_error < 0)
|
||||||
{
|
{
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
request_free (req);
|
request_free (req);
|
||||||
|
|
||||||
|
if (warc_tmp != NULL)
|
||||||
|
fclose (warc_tmp);
|
||||||
|
|
||||||
|
if (write_error == -2)
|
||||||
|
return WARC_TMP_FWRITEERR;
|
||||||
|
else
|
||||||
return WRITEFAILED;
|
return WRITEFAILED;
|
||||||
}
|
}
|
||||||
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
|
logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
|
||||||
@ -1950,6 +2150,29 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
contrange = 0;
|
contrange = 0;
|
||||||
*dt &= ~RETROKF;
|
*dt &= ~RETROKF;
|
||||||
|
|
||||||
|
|
||||||
|
if (warc_enabled)
|
||||||
|
{
|
||||||
|
bool warc_result;
|
||||||
|
/* Generate a timestamp and uuid for this request. */
|
||||||
|
warc_timestamp (warc_timestamp_str);
|
||||||
|
warc_uuid_str (warc_request_uuid);
|
||||||
|
|
||||||
|
/* Create a request record and store it in the WARC file. */
|
||||||
|
warc_result = warc_write_request_record (u->url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip,
|
||||||
|
warc_tmp, warc_payload_offset);
|
||||||
|
if (! warc_result)
|
||||||
|
{
|
||||||
|
CLOSE_INVALIDATE (sock);
|
||||||
|
request_free (req);
|
||||||
|
return WARC_ERR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* warc_write_request_record has also closed warc_tmp. */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
read_header:
|
read_header:
|
||||||
head = read_http_response_head (sock);
|
head = read_http_response_head (sock);
|
||||||
if (!head)
|
if (!head)
|
||||||
@ -2073,11 +2296,42 @@ read_header:
|
|||||||
if (statcode == HTTP_STATUS_UNAUTHORIZED)
|
if (statcode == HTTP_STATUS_UNAUTHORIZED)
|
||||||
{
|
{
|
||||||
/* Authorization is required. */
|
/* Authorization is required. */
|
||||||
|
|
||||||
|
/* Normally we are not interested in the response body.
|
||||||
|
But if we are writing a WARC file we are: we like to keep everything. */
|
||||||
|
if (warc_enabled)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
type = resp_header_strdup (resp, "Content-Type");
|
||||||
|
err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||||
|
chunked_transfer_encoding,
|
||||||
|
u->url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip, type,
|
||||||
|
statcode, head);
|
||||||
|
xfree_null (type);
|
||||||
|
|
||||||
|
if (err != RETRFINISHED || hs->res < 0)
|
||||||
|
{
|
||||||
|
CLOSE_INVALIDATE (sock);
|
||||||
|
request_free (req);
|
||||||
|
xfree_null (message);
|
||||||
|
resp_free (resp);
|
||||||
|
xfree (head);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
CLOSE_FINISH (sock);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Since WARC is disabled, we are not interested in the response body. */
|
||||||
if (keep_alive && !head_only
|
if (keep_alive && !head_only
|
||||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||||
CLOSE_FINISH (sock);
|
CLOSE_FINISH (sock);
|
||||||
else
|
else
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
|
}
|
||||||
|
|
||||||
pconn.authorized = false;
|
pconn.authorized = false;
|
||||||
if (!auth_finished && (user && passwd))
|
if (!auth_finished && (user && passwd))
|
||||||
{
|
{
|
||||||
@ -2325,11 +2579,42 @@ read_header:
|
|||||||
_("Location: %s%s\n"),
|
_("Location: %s%s\n"),
|
||||||
hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
|
hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
|
||||||
hs->newloc ? _(" [following]") : "");
|
hs->newloc ? _(" [following]") : "");
|
||||||
|
|
||||||
|
/* In case the caller cares to look... */
|
||||||
|
hs->len = 0;
|
||||||
|
hs->res = 0;
|
||||||
|
hs->restval = 0;
|
||||||
|
|
||||||
|
/* Normally we are not interested in the response body of a redirect.
|
||||||
|
But if we are writing a WARC file we are: we like to keep everything. */
|
||||||
|
if (warc_enabled)
|
||||||
|
{
|
||||||
|
int err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||||
|
chunked_transfer_encoding,
|
||||||
|
u->url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip, type,
|
||||||
|
statcode, head);
|
||||||
|
|
||||||
|
if (err != RETRFINISHED || hs->res < 0)
|
||||||
|
{
|
||||||
|
CLOSE_INVALIDATE (sock);
|
||||||
|
xfree_null (type);
|
||||||
|
xfree (head);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
CLOSE_FINISH (sock);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Since WARC is disabled, we are not interested in the response body. */
|
||||||
if (keep_alive && !head_only
|
if (keep_alive && !head_only
|
||||||
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
&& skip_short_body (sock, contlen, chunked_transfer_encoding))
|
||||||
CLOSE_FINISH (sock);
|
CLOSE_FINISH (sock);
|
||||||
else
|
else
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
|
}
|
||||||
|
|
||||||
xfree_null (type);
|
xfree_null (type);
|
||||||
xfree (head);
|
xfree (head);
|
||||||
/* From RFC2616: The status codes 303 and 307 have
|
/* From RFC2616: The status codes 303 and 307 have
|
||||||
@ -2447,8 +2732,6 @@ read_header:
|
|||||||
logputs (LOG_VERBOSE, "\n");
|
logputs (LOG_VERBOSE, "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
xfree_null (type);
|
|
||||||
type = NULL; /* We don't need it any more. */
|
|
||||||
|
|
||||||
/* Return if we have no intention of further downloading. */
|
/* Return if we have no intention of further downloading. */
|
||||||
if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only)
|
if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only)
|
||||||
@ -2456,7 +2739,31 @@ read_header:
|
|||||||
/* In case the caller cares to look... */
|
/* In case the caller cares to look... */
|
||||||
hs->len = 0;
|
hs->len = 0;
|
||||||
hs->res = 0;
|
hs->res = 0;
|
||||||
|
hs->restval = 0;
|
||||||
|
|
||||||
|
/* Normally we are not interested in the response body of an error response.
|
||||||
|
But if we are writing a WARC file we are: we like to keep everything. */
|
||||||
|
if (warc_enabled)
|
||||||
|
{
|
||||||
|
int err = read_response_body (hs, sock, NULL, contlen, 0,
|
||||||
|
chunked_transfer_encoding,
|
||||||
|
u->url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip, type,
|
||||||
|
statcode, head);
|
||||||
|
|
||||||
|
if (err != RETRFINISHED || hs->res < 0)
|
||||||
|
{
|
||||||
|
CLOSE_INVALIDATE (sock);
|
||||||
|
xfree (head);
|
||||||
xfree_null (type);
|
xfree_null (type);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
CLOSE_FINISH (sock);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Since WARC is disabled, we are not interested in the response body. */
|
||||||
if (head_only)
|
if (head_only)
|
||||||
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
|
/* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the
|
||||||
servers not to send body in response to a HEAD request, and
|
servers not to send body in response to a HEAD request, and
|
||||||
@ -2470,7 +2777,10 @@ read_header:
|
|||||||
CLOSE_FINISH (sock);
|
CLOSE_FINISH (sock);
|
||||||
else
|
else
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
|
}
|
||||||
|
|
||||||
xfree (head);
|
xfree (head);
|
||||||
|
xfree_null (type);
|
||||||
return RETRFINISHED;
|
return RETRFINISHED;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2512,6 +2822,7 @@ read_header:
|
|||||||
strerror (errno));
|
strerror (errno));
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
xfree (head);
|
xfree (head);
|
||||||
|
xfree_null (type);
|
||||||
return UNLINKERR;
|
return UNLINKERR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2539,6 +2850,7 @@ read_header:
|
|||||||
hs->local_file);
|
hs->local_file);
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
xfree (head);
|
xfree (head);
|
||||||
|
xfree_null (type);
|
||||||
return FOPEN_EXCL_ERR;
|
return FOPEN_EXCL_ERR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2547,6 +2859,7 @@ read_header:
|
|||||||
logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
|
logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
xfree (head);
|
xfree (head);
|
||||||
|
xfree_null (type);
|
||||||
return FOPENERR;
|
return FOPENERR;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2560,49 +2873,26 @@ read_header:
|
|||||||
HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
|
HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This confuses the timestamping code that checks for file size.
|
|
||||||
#### The timestamping code should be smarter about file size. */
|
err = read_response_body (hs, sock, fp, contlen, contrange,
|
||||||
if (opt.save_headers && hs->restval == 0)
|
chunked_transfer_encoding,
|
||||||
fwrite (head, 1, strlen (head), fp);
|
u->url, warc_timestamp_str,
|
||||||
|
warc_request_uuid, warc_ip, type,
|
||||||
|
statcode, head);
|
||||||
|
|
||||||
/* Now we no longer need to store the response header. */
|
/* Now we no longer need to store the response header. */
|
||||||
xfree (head);
|
xfree (head);
|
||||||
|
xfree_null (type);
|
||||||
/* Download the request body. */
|
|
||||||
flags = 0;
|
|
||||||
if (contlen != -1)
|
|
||||||
/* If content-length is present, read that much; otherwise, read
|
|
||||||
until EOF. The HTTP spec doesn't require the server to
|
|
||||||
actually close the connection when it's done sending data. */
|
|
||||||
flags |= rb_read_exactly;
|
|
||||||
if (hs->restval > 0 && contrange == 0)
|
|
||||||
/* If the server ignored our range request, instruct fd_read_body
|
|
||||||
to skip the first RESTVAL bytes of body. */
|
|
||||||
flags |= rb_skip_startpos;
|
|
||||||
|
|
||||||
if (chunked_transfer_encoding)
|
|
||||||
flags |= rb_chunked_transfer_encoding;
|
|
||||||
|
|
||||||
hs->len = hs->restval;
|
|
||||||
hs->rd_size = 0;
|
|
||||||
hs->res = fd_read_body (sock, fp, contlen != -1 ? contlen : 0,
|
|
||||||
hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
|
|
||||||
flags);
|
|
||||||
|
|
||||||
if (hs->res >= 0)
|
if (hs->res >= 0)
|
||||||
CLOSE_FINISH (sock);
|
CLOSE_FINISH (sock);
|
||||||
else
|
else
|
||||||
{
|
|
||||||
if (hs->res < 0)
|
|
||||||
hs->rderrmsg = xstrdup (fd_errstr (sock));
|
|
||||||
CLOSE_INVALIDATE (sock);
|
CLOSE_INVALIDATE (sock);
|
||||||
}
|
|
||||||
|
|
||||||
if (!output_stream)
|
if (!output_stream)
|
||||||
fclose (fp);
|
fclose (fp);
|
||||||
if (hs->res == -2)
|
|
||||||
return FWRITEERR;
|
return err;
|
||||||
return RETRFINISHED;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The genuine HTTP loop! This is the part where the retrieval is
|
/* The genuine HTTP loop! This is the part where the retrieval is
|
||||||
@ -2626,6 +2916,12 @@ http_loop (struct url *u, struct url *original_url, char **newloc,
|
|||||||
char *file_name;
|
char *file_name;
|
||||||
bool force_full_retrieve = false;
|
bool force_full_retrieve = false;
|
||||||
|
|
||||||
|
|
||||||
|
/* If we are writing to a WARC file: always retrieve the whole file. */
|
||||||
|
if (opt.warc_filename != NULL)
|
||||||
|
force_full_retrieve = true;
|
||||||
|
|
||||||
|
|
||||||
/* Assert that no value for *LOCAL_FILE was passed. */
|
/* Assert that no value for *LOCAL_FILE was passed. */
|
||||||
assert (local_file == NULL || *local_file == NULL);
|
assert (local_file == NULL || *local_file == NULL);
|
||||||
|
|
||||||
@ -2795,6 +3091,18 @@ Spider mode enabled. Check if remote file exists.\n"));
|
|||||||
/* Fatal errors just return from the function. */
|
/* Fatal errors just return from the function. */
|
||||||
ret = err;
|
ret = err;
|
||||||
goto exit;
|
goto exit;
|
||||||
|
case WARC_ERR:
|
||||||
|
/* A fatal WARC error. */
|
||||||
|
logputs (LOG_VERBOSE, "\n");
|
||||||
|
logprintf (LOG_NOTQUIET, _("Cannot write to WARC file..\n"));
|
||||||
|
ret = err;
|
||||||
|
goto exit;
|
||||||
|
case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
|
||||||
|
/* A fatal WARC error. */
|
||||||
|
logputs (LOG_VERBOSE, "\n");
|
||||||
|
logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
|
||||||
|
ret = err;
|
||||||
|
goto exit;
|
||||||
case CONSSLERR:
|
case CONSSLERR:
|
||||||
/* Another fatal error. */
|
/* Another fatal error. */
|
||||||
logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
|
logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
|
||||||
|
40
src/init.c
40
src/init.c
@ -88,6 +88,7 @@ CMD_DECLARE (cmd_vector);
|
|||||||
|
|
||||||
CMD_DECLARE (cmd_spec_dirstruct);
|
CMD_DECLARE (cmd_spec_dirstruct);
|
||||||
CMD_DECLARE (cmd_spec_header);
|
CMD_DECLARE (cmd_spec_header);
|
||||||
|
CMD_DECLARE (cmd_spec_warc_header);
|
||||||
CMD_DECLARE (cmd_spec_htmlify);
|
CMD_DECLARE (cmd_spec_htmlify);
|
||||||
CMD_DECLARE (cmd_spec_mirror);
|
CMD_DECLARE (cmd_spec_mirror);
|
||||||
CMD_DECLARE (cmd_spec_prefer_family);
|
CMD_DECLARE (cmd_spec_prefer_family);
|
||||||
@ -264,6 +265,15 @@ static const struct {
|
|||||||
{ "verbose", NULL, cmd_spec_verbose },
|
{ "verbose", NULL, cmd_spec_verbose },
|
||||||
{ "wait", &opt.wait, cmd_time },
|
{ "wait", &opt.wait, cmd_time },
|
||||||
{ "waitretry", &opt.waitretry, cmd_time },
|
{ "waitretry", &opt.waitretry, cmd_time },
|
||||||
|
{ "warccdx", &opt.warc_cdx_enabled, cmd_boolean },
|
||||||
|
{ "warccdxdedup", &opt.warc_cdx_dedup_filename, cmd_file },
|
||||||
|
{ "warccompression", &opt.warc_compression_enabled, cmd_boolean },
|
||||||
|
{ "warcdigests", &opt.warc_digests_enabled, cmd_boolean },
|
||||||
|
{ "warcfile", &opt.warc_filename, cmd_file },
|
||||||
|
{ "warcheader", NULL, cmd_spec_warc_header },
|
||||||
|
{ "warckeeplog", &opt.warc_keep_log, cmd_boolean },
|
||||||
|
{ "warcmaxsize", &opt.warc_maxsize, cmd_bytes },
|
||||||
|
{ "warctempdir", &opt.warc_tempdir, cmd_directory },
|
||||||
#ifdef USE_WATT32
|
#ifdef USE_WATT32
|
||||||
{ "wdebug", &opt.wdebug, cmd_boolean },
|
{ "wdebug", &opt.wdebug, cmd_boolean },
|
||||||
#endif
|
#endif
|
||||||
@ -362,6 +372,14 @@ defaults (void)
|
|||||||
|
|
||||||
opt.useservertimestamps = true;
|
opt.useservertimestamps = true;
|
||||||
opt.show_all_dns_entries = false;
|
opt.show_all_dns_entries = false;
|
||||||
|
|
||||||
|
opt.warc_maxsize = 0; /* 1024 * 1024 * 1024; */
|
||||||
|
opt.warc_compression_enabled = true;
|
||||||
|
opt.warc_digests_enabled = true;
|
||||||
|
opt.warc_cdx_enabled = false;
|
||||||
|
opt.warc_cdx_dedup_filename = NULL;
|
||||||
|
opt.warc_tempdir = NULL;
|
||||||
|
opt.warc_keep_log = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return the user's home directory (strdup-ed), or NULL if none is
|
/* Return the user's home directory (strdup-ed), or NULL if none is
|
||||||
@ -1235,6 +1253,27 @@ cmd_spec_header (const char *com, const char *val, void *place_ignored)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
cmd_spec_warc_header (const char *com, const char *val, void *place_ignored)
|
||||||
|
{
|
||||||
|
/* Empty value means reset the list of headers. */
|
||||||
|
if (*val == '\0')
|
||||||
|
{
|
||||||
|
free_vec (opt.warc_user_headers);
|
||||||
|
opt.warc_user_headers = NULL;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_user_specified_header (val))
|
||||||
|
{
|
||||||
|
fprintf (stderr, _("%s: %s: Invalid WARC header %s.\n"),
|
||||||
|
exec_name, com, quote (val));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
opt.warc_user_headers = vec_append (opt.warc_user_headers, val);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
cmd_spec_htmlify (const char *com, const char *val, void *place_ignored)
|
cmd_spec_htmlify (const char *com, const char *val, void *place_ignored)
|
||||||
{
|
{
|
||||||
@ -1639,6 +1678,7 @@ cleanup (void)
|
|||||||
xfree_null (opt.http_user);
|
xfree_null (opt.http_user);
|
||||||
xfree_null (opt.http_passwd);
|
xfree_null (opt.http_passwd);
|
||||||
free_vec (opt.user_headers);
|
free_vec (opt.user_headers);
|
||||||
|
free_vec (opt.warc_user_headers);
|
||||||
# ifdef HAVE_SSL
|
# ifdef HAVE_SSL
|
||||||
xfree_null (opt.cert_file);
|
xfree_null (opt.cert_file);
|
||||||
xfree_null (opt.private_key);
|
xfree_null (opt.private_key);
|
||||||
|
54
src/log.c
54
src/log.c
@ -79,6 +79,10 @@ as that of the covered work. */
|
|||||||
logging is inhibited, logfp is set back to NULL. */
|
logging is inhibited, logfp is set back to NULL. */
|
||||||
static FILE *logfp;
|
static FILE *logfp;
|
||||||
|
|
||||||
|
/* A second file descriptor pointing to the temporary log file for the
|
||||||
|
WARC writer. If WARC writing is disabled, this is NULL. */
|
||||||
|
static FILE *warclogfp;
|
||||||
|
|
||||||
/* If true, it means logging is inhibited, i.e. nothing is printed or
|
/* If true, it means logging is inhibited, i.e. nothing is printed or
|
||||||
stored. */
|
stored. */
|
||||||
static bool inhibit_logging;
|
static bool inhibit_logging;
|
||||||
@ -304,6 +308,31 @@ get_log_fp (void)
|
|||||||
return logfp;
|
return logfp;
|
||||||
return stderr;
|
return stderr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns the stream for the secondary log file. This is
|
||||||
|
WARCLOGFP, or NULL if no WARC log stream has been set. Unlike
|
||||||
|
get_log_fp, this function never falls back to stderr, so the
|
||||||
|
result may be NULL even when logging is enabled.
|
||||||
|
|
||||||
|
If logging is inhibited, return NULL. */
|
||||||
|
|
||||||
|
static FILE *
|
||||||
|
get_warc_log_fp (void)
|
||||||
|
{
|
||||||
|
if (inhibit_logging)
|
||||||
|
return NULL;
|
||||||
|
if (warclogfp)
|
||||||
|
return warclogfp;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sets the file descriptor for the secondary log file. */
|
||||||
|
|
||||||
|
void
|
||||||
|
log_set_warc_log_fp (FILE * fp)
|
||||||
|
{
|
||||||
|
warclogfp = fp;
|
||||||
|
}
|
||||||
|
|
||||||
/* Log a literal string S. The string is logged as-is, without a
|
/* Log a literal string S. The string is logged as-is, without a
|
||||||
newline appended. */
|
newline appended. */
|
||||||
@ -312,13 +341,17 @@ void
|
|||||||
logputs (enum log_options o, const char *s)
|
logputs (enum log_options o, const char *s)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
FILE *warcfp;
|
||||||
|
|
||||||
check_redirect_output ();
|
check_redirect_output ();
|
||||||
if ((fp = get_log_fp ()) == NULL)
|
if ((fp = get_log_fp ()) == NULL)
|
||||||
return;
|
return;
|
||||||
|
warcfp = get_warc_log_fp ();
|
||||||
CHECK_VERBOSE (o);
|
CHECK_VERBOSE (o);
|
||||||
|
|
||||||
FPUTS (s, fp);
|
FPUTS (s, fp);
|
||||||
|
if (warcfp != NULL)
|
||||||
|
FPUTS (s, warcfp);
|
||||||
if (save_context_p)
|
if (save_context_p)
|
||||||
saved_append (s);
|
saved_append (s);
|
||||||
if (flush_log_p)
|
if (flush_log_p)
|
||||||
@ -356,8 +389,9 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
|
|||||||
int available_size = sizeof (smallmsg);
|
int available_size = sizeof (smallmsg);
|
||||||
int numwritten;
|
int numwritten;
|
||||||
FILE *fp = get_log_fp ();
|
FILE *fp = get_log_fp ();
|
||||||
|
FILE *warcfp = get_warc_log_fp ();
|
||||||
|
|
||||||
if (!save_context_p)
|
if (!save_context_p && warcfp == NULL)
|
||||||
{
|
{
|
||||||
/* In the simple case just call vfprintf(), to avoid needless
|
/* In the simple case just call vfprintf(), to avoid needless
|
||||||
allocation and games with vsnprintf(). */
|
allocation and games with vsnprintf(). */
|
||||||
@ -407,8 +441,11 @@ log_vprintf_internal (struct logvprintf_state *state, const char *fmt,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Writing succeeded. */
|
/* Writing succeeded. */
|
||||||
|
if (save_context_p)
|
||||||
saved_append (write_ptr);
|
saved_append (write_ptr);
|
||||||
FPUTS (write_ptr, fp);
|
FPUTS (write_ptr, fp);
|
||||||
|
if (warcfp != NULL)
|
||||||
|
FPUTS (write_ptr, warcfp);
|
||||||
if (state->bigmsg)
|
if (state->bigmsg)
|
||||||
xfree (state->bigmsg);
|
xfree (state->bigmsg);
|
||||||
|
|
||||||
@ -426,6 +463,7 @@ void
|
|||||||
logflush (void)
|
logflush (void)
|
||||||
{
|
{
|
||||||
FILE *fp = get_log_fp ();
|
FILE *fp = get_log_fp ();
|
||||||
|
FILE *warcfp = get_warc_log_fp ();
|
||||||
if (fp)
|
if (fp)
|
||||||
{
|
{
|
||||||
/* 2005-10-25 SMS.
|
/* 2005-10-25 SMS.
|
||||||
@ -440,6 +478,10 @@ logflush (void)
|
|||||||
fflush (fp);
|
fflush (fp);
|
||||||
#endif /* def __VMS [else] */
|
#endif /* def __VMS [else] */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (warcfp != NULL)
|
||||||
|
fflush (warcfp);
|
||||||
|
|
||||||
needs_flushing = false;
|
needs_flushing = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -598,6 +640,7 @@ log_dump_context (void)
|
|||||||
{
|
{
|
||||||
int num = log_line_current;
|
int num = log_line_current;
|
||||||
FILE *fp = get_log_fp ();
|
FILE *fp = get_log_fp ();
|
||||||
|
FILE *warcfp = get_warc_log_fp ();
|
||||||
if (!fp)
|
if (!fp)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -609,14 +652,23 @@ log_dump_context (void)
|
|||||||
{
|
{
|
||||||
struct log_ln *ln = log_lines + num;
|
struct log_ln *ln = log_lines + num;
|
||||||
if (ln->content)
|
if (ln->content)
|
||||||
|
{
|
||||||
FPUTS (ln->content, fp);
|
FPUTS (ln->content, fp);
|
||||||
|
if (warcfp != NULL)
|
||||||
|
FPUTS (ln->content, warcfp);
|
||||||
|
}
|
||||||
ROT_ADVANCE (num);
|
ROT_ADVANCE (num);
|
||||||
}
|
}
|
||||||
while (num != log_line_current);
|
while (num != log_line_current);
|
||||||
if (trailing_line)
|
if (trailing_line)
|
||||||
if (log_lines[log_line_current].content)
|
if (log_lines[log_line_current].content)
|
||||||
|
{
|
||||||
FPUTS (log_lines[log_line_current].content, fp);
|
FPUTS (log_lines[log_line_current].content, fp);
|
||||||
|
if (warcfp != NULL)
|
||||||
|
FPUTS (log_lines[log_line_current].content, warcfp);
|
||||||
|
}
|
||||||
fflush (fp);
|
fflush (fp);
|
||||||
|
fflush (warcfp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* String escape functions. */
|
/* String escape functions. */
|
||||||
|
@ -34,8 +34,12 @@ as that of the covered work. */
|
|||||||
/* The log file to which Wget writes to after HUP. */
|
/* The log file to which Wget writes to after HUP. */
|
||||||
#define DEFAULT_LOGFILE "wget-log"
|
#define DEFAULT_LOGFILE "wget-log"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS };
|
enum log_options { LOG_VERBOSE, LOG_NOTQUIET, LOG_NONVERBOSE, LOG_ALWAYS };
|
||||||
|
|
||||||
|
void log_set_warc_log_fp (FILE *);
|
||||||
|
|
||||||
void logprintf (enum log_options, const char *, ...)
|
void logprintf (enum log_options, const char *, ...)
|
||||||
GCC_FORMAT_ATTR (2, 3);
|
GCC_FORMAT_ATTR (2, 3);
|
||||||
void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2);
|
void debug_logprintf (const char *, ...) GCC_FORMAT_ATTR (1, 2);
|
||||||
|
100
src/main.c
100
src/main.c
@ -55,6 +55,7 @@ as that of the covered work. */
|
|||||||
#include "spider.h"
|
#include "spider.h"
|
||||||
#include "http.h" /* for save_cookies */
|
#include "http.h" /* for save_cookies */
|
||||||
#include "ptimer.h"
|
#include "ptimer.h"
|
||||||
|
#include "warc.h"
|
||||||
|
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
#include <getpass.h>
|
#include <getpass.h>
|
||||||
@ -287,6 +288,15 @@ static struct cmdline_option option_data[] =
|
|||||||
{ "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument },
|
{ "version", 'V', OPT_FUNCALL, (void *) print_version, no_argument },
|
||||||
{ "wait", 'w', OPT_VALUE, "wait", -1 },
|
{ "wait", 'w', OPT_VALUE, "wait", -1 },
|
||||||
{ "waitretry", 0, OPT_VALUE, "waitretry", -1 },
|
{ "waitretry", 0, OPT_VALUE, "waitretry", -1 },
|
||||||
|
{ "warc-cdx", 0, OPT_BOOLEAN, "warccdx", -1 },
|
||||||
|
{ "warc-compression", 0, OPT_BOOLEAN, "warccompression", -1 },
|
||||||
|
{ "warc-dedup", 0, OPT_VALUE, "warccdxdedup", -1 },
|
||||||
|
{ "warc-digests", 0, OPT_BOOLEAN, "warcdigests", -1 },
|
||||||
|
{ "warc-file", 0, OPT_VALUE, "warcfile", -1 },
|
||||||
|
{ "warc-header", 0, OPT_VALUE, "warcheader", -1 },
|
||||||
|
{ "warc-keep-log", 0, OPT_BOOLEAN, "warckeeplog", -1 },
|
||||||
|
{ "warc-max-size", 0, OPT_VALUE, "warcmaxsize", -1 },
|
||||||
|
{ "warc-tempdir", 0, OPT_VALUE, "warctempdir", -1 },
|
||||||
#ifdef USE_WATT32
|
#ifdef USE_WATT32
|
||||||
{ "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
|
{ "wdebug", 0, OPT_BOOLEAN, "wdebug", -1 },
|
||||||
#endif
|
#endif
|
||||||
@ -652,6 +662,29 @@ FTP options:\n"),
|
|||||||
--retr-symlinks when recursing, get linked-to files (not dir).\n"),
|
--retr-symlinks when recursing, get linked-to files (not dir).\n"),
|
||||||
"\n",
|
"\n",
|
||||||
|
|
||||||
|
N_("\
|
||||||
|
WARC options:\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-file=FILENAME save request/response data to a .warc.gz file.\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-header=STRING insert STRING into the warcinfo record.\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-max-size=NUMBER set maximum size of WARC files to NUMBER.\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-cdx write CDX index files.\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-dedup=FILENAME do not store records listed in this CDX file.\n"),
|
||||||
|
N_("\
|
||||||
|
--no-warc-compression do not compress WARC files with GZIP.\n"),
|
||||||
|
N_("\
|
||||||
|
--no-warc-digests do not calculate SHA1 digests.\n"),
|
||||||
|
N_("\
|
||||||
|
--no-warc-keep-log do not store the log file in a WARC record.\n"),
|
||||||
|
N_("\
|
||||||
|
--warc-tempdir=DIRECTORY location for temporary files created by the\n\
|
||||||
|
WARC writer.\n"),
|
||||||
|
"\n",
|
||||||
|
|
||||||
N_("\
|
N_("\
|
||||||
Recursive download:\n"),
|
Recursive download:\n"),
|
||||||
N_("\
|
N_("\
|
||||||
@ -910,6 +943,7 @@ There is NO WARRANTY, to the extent permitted by law.\n"), stdout) < 0)
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *program_name; /* Needed by lib/error.c. */
|
char *program_name; /* Needed by lib/error.c. */
|
||||||
|
char *program_argstring; /* Needed by warc.c. */
|
||||||
|
|
||||||
int
|
int
|
||||||
main (int argc, char **argv)
|
main (int argc, char **argv)
|
||||||
@ -945,6 +979,22 @@ main (int argc, char **argv)
|
|||||||
windows_main ((char **) &exec_name);
|
windows_main ((char **) &exec_name);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Construct the arguments string. */
|
||||||
|
int argstring_length = 1;
|
||||||
|
for (i = 1; i < argc; i++)
|
||||||
|
argstring_length += strlen (argv[i]) + 2 + 1;
|
||||||
|
char *p = program_argstring = malloc (argstring_length * sizeof (char));
|
||||||
|
for (i = 1; i < argc; i++)
|
||||||
|
{
|
||||||
|
*p++ = '"';
|
||||||
|
int arglen = strlen (argv[i]);
|
||||||
|
memcpy (p, argv[i], arglen);
|
||||||
|
p += arglen;
|
||||||
|
*p++ = '"';
|
||||||
|
*p++ = ' ';
|
||||||
|
}
|
||||||
|
*p = '\0';
|
||||||
|
|
||||||
/* Load the hard-coded defaults. */
|
/* Load the hard-coded defaults. */
|
||||||
defaults ();
|
defaults ();
|
||||||
|
|
||||||
@ -1194,6 +1244,47 @@ for details.\n\n"));
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (opt.warc_filename != 0)
|
||||||
|
{
|
||||||
|
if (opt.noclobber)
|
||||||
|
{
|
||||||
|
fprintf (stderr,
|
||||||
|
_("WARC output does not work with --no-clobber, "
|
||||||
|
"--no-clobber will be disabled.\n"));
|
||||||
|
opt.noclobber = false;
|
||||||
|
}
|
||||||
|
if (opt.timestamping)
|
||||||
|
{
|
||||||
|
fprintf (stderr,
|
||||||
|
_("WARC output does not work with timestamping, "
|
||||||
|
"timestamping will be disabled.\n"));
|
||||||
|
opt.timestamping = false;
|
||||||
|
}
|
||||||
|
if (opt.spider)
|
||||||
|
{
|
||||||
|
fprintf (stderr,
|
||||||
|
_("WARC output does not work with --spider.\n"));
|
||||||
|
exit (1);
|
||||||
|
}
|
||||||
|
if (opt.always_rest)
|
||||||
|
{
|
||||||
|
fprintf (stderr,
|
||||||
|
_("WARC output does not work with --continue, "
|
||||||
|
"--continue will be disabled.\n"));
|
||||||
|
opt.always_rest = false;
|
||||||
|
}
|
||||||
|
if (opt.warc_cdx_dedup_filename != 0 && !opt.warc_digests_enabled)
|
||||||
|
{
|
||||||
|
fprintf (stderr,
|
||||||
|
_("Digests are disabled; WARC deduplication will "
|
||||||
|
"not find duplicate records.\n"));
|
||||||
|
}
|
||||||
|
if (opt.warc_keep_log)
|
||||||
|
{
|
||||||
|
opt.progress_type = "dot";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (opt.ask_passwd && opt.passwd)
|
if (opt.ask_passwd && opt.passwd)
|
||||||
{
|
{
|
||||||
fprintf (stderr,
|
fprintf (stderr,
|
||||||
@ -1273,6 +1364,10 @@ for details.\n\n"));
|
|||||||
/* Initialize logging. */
|
/* Initialize logging. */
|
||||||
log_init (opt.lfilename, append_to_log);
|
log_init (opt.lfilename, append_to_log);
|
||||||
|
|
||||||
|
/* Open WARC file. */
|
||||||
|
if (opt.warc_filename != 0)
|
||||||
|
warc_init ();
|
||||||
|
|
||||||
DEBUGP (("DEBUG output created by Wget %s on %s.\n\n",
|
DEBUGP (("DEBUG output created by Wget %s on %s.\n\n",
|
||||||
version_string, OS_TYPE));
|
version_string, OS_TYPE));
|
||||||
|
|
||||||
@ -1472,7 +1567,12 @@ outputting to a regular file.\n"));
|
|||||||
if (opt.convert_links && !opt.delete_after)
|
if (opt.convert_links && !opt.delete_after)
|
||||||
convert_all_links ();
|
convert_all_links ();
|
||||||
|
|
||||||
|
/* Close WARC file. */
|
||||||
|
if (opt.warc_filename != 0)
|
||||||
|
warc_close ();
|
||||||
|
|
||||||
log_close ();
|
log_close ();
|
||||||
|
|
||||||
for (i = 0; i < nurl; i++)
|
for (i = 0; i < nurl; i++)
|
||||||
xfree (url[i]);
|
xfree (url[i]);
|
||||||
cleanup ();
|
cleanup ();
|
||||||
|
@ -87,6 +87,15 @@ struct options
|
|||||||
FTP. */
|
FTP. */
|
||||||
char *output_document; /* The output file to which the
|
char *output_document; /* The output file to which the
|
||||||
documents will be printed. */
|
documents will be printed. */
|
||||||
|
char *warc_filename; /* WARC output filename */
|
||||||
|
char *warc_tempdir; /* WARC temp dir */
|
||||||
|
char *warc_cdx_dedup_filename; /* CDX file to be used for deduplication. */
|
||||||
|
wgint warc_maxsize; /* WARC max archive size */
|
||||||
|
bool warc_compression_enabled; /* For GZIP compression. */
|
||||||
|
bool warc_digests_enabled; /* For SHA1 digests. */
|
||||||
|
bool warc_cdx_enabled; /* Create CDX files? */
|
||||||
|
bool warc_keep_log; /* Store the log file in a WARC record. */
|
||||||
|
char **warc_user_headers; /* User-defined WARC header(s). */
|
||||||
|
|
||||||
char *user; /* Generic username */
|
char *user; /* Generic username */
|
||||||
char *passwd; /* Generic password */
|
char *passwd; /* Generic password */
|
||||||
|
37
src/retr.c
37
src/retr.c
@ -139,13 +139,16 @@ limit_bandwidth (wgint bytes, struct ptimer *timer)
|
|||||||
|
|
||||||
/* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
|
/* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
|
||||||
amount of data and decrease SKIP. Increment *TOTAL by the amount
|
amount of data and decrease SKIP. Increment *TOTAL by the amount
|
||||||
of data written. */
|
of data written. If OUT2 is not NULL, also write BUF to OUT2.
|
||||||
|
In case of error writing to OUT, -1 is returned. In case of error
|
||||||
|
writing to OUT2, -2 is returned. If no data is written (for example,
|
||||||
|
when both OUT and OUT2 are NULL), 1 is returned; otherwise 0. */
|
||||||
|
|
||||||
static int
|
static int
|
||||||
write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
|
||||||
wgint *written)
|
wgint *skip, wgint *written)
|
||||||
{
|
{
|
||||||
if (!out)
|
if (out == NULL && out2 == NULL)
|
||||||
return 1;
|
return 1;
|
||||||
if (*skip > bufsize)
|
if (*skip > bufsize)
|
||||||
{
|
{
|
||||||
@ -161,7 +164,10 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (out != NULL)
|
||||||
fwrite (buf, 1, bufsize, out);
|
fwrite (buf, 1, bufsize, out);
|
||||||
|
if (out2 != NULL)
|
||||||
|
fwrite (buf, 1, bufsize, out2);
|
||||||
*written += bufsize;
|
*written += bufsize;
|
||||||
|
|
||||||
/* Immediately flush the downloaded data. This should not hinder
|
/* Immediately flush the downloaded data. This should not hinder
|
||||||
@ -178,9 +184,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
|||||||
actual justification. (Also, why 16K? Anyone test other values?)
|
actual justification. (Also, why 16K? Anyone test other values?)
|
||||||
*/
|
*/
|
||||||
#ifndef __VMS
|
#ifndef __VMS
|
||||||
|
if (out != NULL)
|
||||||
fflush (out);
|
fflush (out);
|
||||||
|
if (out2 != NULL)
|
||||||
|
fflush (out2);
|
||||||
#endif /* ndef __VMS */
|
#endif /* ndef __VMS */
|
||||||
return !ferror (out);
|
if (out != NULL && ferror (out))
|
||||||
|
return -1;
|
||||||
|
else if (out2 != NULL && ferror (out2))
|
||||||
|
return -2;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Read the contents of file descriptor FD until the connection
|
/* Read the contents of file descriptor FD until the connection
|
||||||
@ -198,13 +212,17 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
|
|||||||
the amount of data written to disk. The time it took to download
|
the amount of data written to disk. The time it took to download
|
||||||
the data is stored to ELAPSED.
|
the data is stored to ELAPSED.
|
||||||
|
|
||||||
|
If OUT2 is non-NULL, the contents are also written to OUT2.
|
||||||
|
|
||||||
The function exits and returns the amount of data read. In case of
|
The function exits and returns the amount of data read. In case of
|
||||||
error while reading data, -1 is returned. In case of error while
|
error while reading data, -1 is returned. In case of error while
|
||||||
writing data, -2 is returned. */
|
writing data to OUT, -2 is returned. In case of error while writing
|
||||||
|
data to OUT2, -3 is returned. */
|
||||||
|
|
||||||
int
|
int
|
||||||
fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
|
fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
|
||||||
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
|
wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
|
||||||
|
FILE *out2)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
#undef max
|
#undef max
|
||||||
@ -343,9 +361,10 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
|
|||||||
if (ret > 0)
|
if (ret > 0)
|
||||||
{
|
{
|
||||||
sum_read += ret;
|
sum_read += ret;
|
||||||
if (!write_data (out, dlbuf, ret, &skip, &sum_written))
|
int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
|
||||||
|
if (write_res != 0)
|
||||||
{
|
{
|
||||||
ret = -2;
|
ret = (write_res == -3) ? -3 : -2;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
if (chunked)
|
if (chunked)
|
||||||
|
@ -50,7 +50,7 @@ enum {
|
|||||||
rb_chunked_transfer_encoding = 4
|
rb_chunked_transfer_encoding = 4
|
||||||
};
|
};
|
||||||
|
|
||||||
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int);
|
int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int, FILE *);
|
||||||
|
|
||||||
typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
|
typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
|
||||||
|
|
||||||
|
@ -46,6 +46,8 @@ const char *test_append_uri_pathel();
|
|||||||
const char *test_are_urls_equal();
|
const char *test_are_urls_equal();
|
||||||
const char *test_is_robots_txt_url();
|
const char *test_is_robots_txt_url();
|
||||||
|
|
||||||
|
const char *program_argstring = "TEST";
|
||||||
|
|
||||||
int tests_run;
|
int tests_run;
|
||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
|
1332
src/warc.c
Normal file
1332
src/warc.c
Normal file
File diff suppressed because it is too large
Load Diff
19
src/warc.h
Normal file
19
src/warc.h
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
/* Declarations of WARC helper methods. */
|
||||||
|
#ifndef WARC_H
|
||||||
|
#define WARC_H
|
||||||
|
|
||||||
|
#include "host.h"
|
||||||
|
|
||||||
|
void warc_init ();
|
||||||
|
void warc_close ();
|
||||||
|
void warc_timestamp (char *timestamp);
|
||||||
|
void warc_uuid_str (char *id_str);
|
||||||
|
|
||||||
|
FILE * warc_tempfile ();
|
||||||
|
|
||||||
|
bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset);
|
||||||
|
bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location);
|
||||||
|
bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset);
|
||||||
|
|
||||||
|
#endif /* WARC_H */
|
||||||
|
|
@ -353,7 +353,9 @@ typedef enum
|
|||||||
PROXERR,
|
PROXERR,
|
||||||
/* 50 */
|
/* 50 */
|
||||||
AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR,
|
AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED, VERIFCERTERR,
|
||||||
UNLINKERR, NEWLOCATION_KEEP_POST
|
UNLINKERR, NEWLOCATION_KEEP_POST,
|
||||||
|
|
||||||
|
WARC_ERR, WARC_TMP_FOPENERR, WARC_TMP_FWRITEERR
|
||||||
} uerr_t;
|
} uerr_t;
|
||||||
|
|
||||||
/* 2005-02-19 SMS.
|
/* 2005-02-19 SMS.
|
||||||
|
Loading…
Reference in New Issue
Block a user