mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
warc: support large files.
This commit is contained in:
parent
408126aae0
commit
6a25955fe6
@ -1,3 +1,9 @@
|
|||||||
|
2012-02-01 Gijs van Tulder <gvtulder@gmail.com>
|
||||||
|
|
||||||
|
* warc.c: Fix large file support with ftello, fseeko.
|
||||||
|
* warc.h: Fix large file support.
|
||||||
|
* http.c: Fix large file support.
|
||||||
|
|
||||||
2012-02-23 Giuseppe Scrivano <giuseppe@southpole.se>
|
2012-02-23 Giuseppe Scrivano <giuseppe@southpole.se>
|
||||||
|
|
||||||
* main.c (main): Write diagnostic messages to `stderr' not to `stdout'.
|
* main.c (main): Write diagnostic messages to `stderr' not to `stdout'.
|
||||||
|
@ -1712,7 +1712,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
char warc_timestamp_str [21];
|
char warc_timestamp_str [21];
|
||||||
char warc_request_uuid [48];
|
char warc_request_uuid [48];
|
||||||
ip_address *warc_ip = NULL;
|
ip_address *warc_ip = NULL;
|
||||||
long int warc_payload_offset = -1;
|
off_t warc_payload_offset = -1;
|
||||||
|
|
||||||
/* Whether this connection will be kept alive after the HTTP request
|
/* Whether this connection will be kept alive after the HTTP request
|
||||||
is done. */
|
is done. */
|
||||||
@ -2127,7 +2127,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
if (write_error >= 0 && warc_tmp != NULL)
|
if (write_error >= 0 && warc_tmp != NULL)
|
||||||
{
|
{
|
||||||
/* Remember end of headers / start of payload. */
|
/* Remember end of headers / start of payload. */
|
||||||
warc_payload_offset = ftell (warc_tmp);
|
warc_payload_offset = ftello (warc_tmp);
|
||||||
|
|
||||||
/* Write a copy of the data to the WARC record. */
|
/* Write a copy of the data to the WARC record. */
|
||||||
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
|
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
|
||||||
@ -2139,7 +2139,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
|
|||||||
{
|
{
|
||||||
if (warc_tmp != NULL)
|
if (warc_tmp != NULL)
|
||||||
/* Remember end of headers / start of payload. */
|
/* Remember end of headers / start of payload. */
|
||||||
warc_payload_offset = ftell (warc_tmp);
|
warc_payload_offset = ftello (warc_tmp);
|
||||||
|
|
||||||
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
|
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
|
||||||
}
|
}
|
||||||
|
54
src/warc.c
54
src/warc.c
@ -78,10 +78,10 @@ static FILE *warc_current_file;
|
|||||||
static gzFile *warc_current_gzfile;
|
static gzFile *warc_current_gzfile;
|
||||||
|
|
||||||
/* The offset of the current gzip record in the WARC file. */
|
/* The offset of the current gzip record in the WARC file. */
|
||||||
static size_t warc_current_gzfile_offset;
|
static off_t warc_current_gzfile_offset;
|
||||||
|
|
||||||
/* The uncompressed size (so far) of the current record. */
|
/* The uncompressed size (so far) of the current record. */
|
||||||
static size_t warc_current_gzfile_uncompressed_size;
|
static off_t warc_current_gzfile_uncompressed_size;
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
/* This is true until a warc_write_* method fails. */
|
/* This is true until a warc_write_* method fails. */
|
||||||
@ -186,7 +186,7 @@ warc_write_start_record ()
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
fflush (warc_current_file);
|
fflush (warc_current_file);
|
||||||
if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize)
|
if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
|
||||||
warc_start_new_file (false);
|
warc_start_new_file (false);
|
||||||
|
|
||||||
#ifdef HAVE_LIBZ
|
#ifdef HAVE_LIBZ
|
||||||
@ -194,7 +194,7 @@ warc_write_start_record ()
|
|||||||
if (opt.warc_compression_enabled)
|
if (opt.warc_compression_enabled)
|
||||||
{
|
{
|
||||||
/* Record the starting offset of the new record. */
|
/* Record the starting offset of the new record. */
|
||||||
warc_current_gzfile_offset = ftell (warc_current_file);
|
warc_current_gzfile_offset = ftello (warc_current_file);
|
||||||
|
|
||||||
/* Reserve space for the extra GZIP header field.
|
/* Reserve space for the extra GZIP header field.
|
||||||
In warc_write_end_record we will fill this space
|
In warc_write_end_record we will fill this space
|
||||||
@ -245,8 +245,8 @@ warc_write_block_from_file (FILE *data_in)
|
|||||||
{
|
{
|
||||||
/* Add the Content-Length header. */
|
/* Add the Content-Length header. */
|
||||||
char *content_length;
|
char *content_length;
|
||||||
fseek (data_in, 0L, SEEK_END);
|
fseeko (data_in, 0L, SEEK_END);
|
||||||
if (! asprintf (&content_length, "%ld", ftell (data_in)))
|
if (! asprintf (&content_length, "%ld", ftello (data_in)))
|
||||||
{
|
{
|
||||||
warc_write_ok = false;
|
warc_write_ok = false;
|
||||||
return false;
|
return false;
|
||||||
@ -257,7 +257,7 @@ warc_write_block_from_file (FILE *data_in)
|
|||||||
/* End of the WARC header section. */
|
/* End of the WARC header section. */
|
||||||
warc_write_string ("\r\n");
|
warc_write_string ("\r\n");
|
||||||
|
|
||||||
if (fseek (data_in, 0L, SEEK_SET) != 0)
|
if (fseeko (data_in, 0L, SEEK_SET) != 0)
|
||||||
warc_write_ok = false;
|
warc_write_ok = false;
|
||||||
|
|
||||||
/* Copy the data in the file to the WARC record. */
|
/* Copy the data in the file to the WARC record. */
|
||||||
@ -294,7 +294,7 @@ warc_write_end_record ()
|
|||||||
}
|
}
|
||||||
|
|
||||||
fflush (warc_current_file);
|
fflush (warc_current_file);
|
||||||
fseek (warc_current_file, 0, SEEK_END);
|
fseeko (warc_current_file, 0, SEEK_END);
|
||||||
|
|
||||||
/* The WARC standard suggests that we add 'skip length' data in the
|
/* The WARC standard suggests that we add 'skip length' data in the
|
||||||
extra header field of the GZIP stream.
|
extra header field of the GZIP stream.
|
||||||
@ -312,12 +312,12 @@ warc_write_end_record ()
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* Calculate the uncompressed and compressed sizes. */
|
/* Calculate the uncompressed and compressed sizes. */
|
||||||
size_t current_offset = ftell (warc_current_file);
|
off_t current_offset = ftello (warc_current_file);
|
||||||
size_t uncompressed_size = current_offset - warc_current_gzfile_offset;
|
off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
|
||||||
size_t compressed_size = warc_current_gzfile_uncompressed_size;
|
off_t compressed_size = warc_current_gzfile_uncompressed_size;
|
||||||
|
|
||||||
/* Go back to the static GZIP header. */
|
/* Go back to the static GZIP header. */
|
||||||
fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
|
fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
|
||||||
|
|
||||||
/* Read the header. */
|
/* Read the header. */
|
||||||
char static_header[GZIP_STATIC_HEADER_SIZE];
|
char static_header[GZIP_STATIC_HEADER_SIZE];
|
||||||
@ -332,7 +332,7 @@ warc_write_end_record ()
|
|||||||
static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
|
static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
|
||||||
|
|
||||||
/* Write the header back to the file, but starting at warc_current_gzfile_offset. */
|
/* Write the header back to the file, but starting at warc_current_gzfile_offset. */
|
||||||
fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
|
fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
|
||||||
fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
|
fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
|
||||||
|
|
||||||
/* Prepare the extra GZIP header. */
|
/* Prepare the extra GZIP header. */
|
||||||
@ -355,12 +355,12 @@ warc_write_end_record ()
|
|||||||
extra_header[11] = (compressed_size >> 24) & 255;
|
extra_header[11] = (compressed_size >> 24) & 255;
|
||||||
|
|
||||||
/* Write the extra header after the static header. */
|
/* Write the extra header after the static header. */
|
||||||
fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
|
fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
|
||||||
fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
|
fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
|
||||||
|
|
||||||
/* Done, move back to the end of the file. */
|
/* Done, move back to the end of the file. */
|
||||||
fflush (warc_current_file);
|
fflush (warc_current_file);
|
||||||
fseek (warc_current_file, 0, SEEK_END);
|
fseeko (warc_current_file, 0, SEEK_END);
|
||||||
}
|
}
|
||||||
#endif /* HAVE_LIBZ */
|
#endif /* HAVE_LIBZ */
|
||||||
|
|
||||||
@ -408,14 +408,14 @@ warc_write_ip_header (ip_address *ip)
|
|||||||
the end of the file. The digest number will be written into the
|
the end of the file. The digest number will be written into the
|
||||||
16 bytes beginning ad RES_PAYLOAD. */
|
16 bytes beginning ad RES_PAYLOAD. */
|
||||||
static int
|
static int
|
||||||
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset)
|
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
|
||||||
{
|
{
|
||||||
#define BLOCKSIZE 32768
|
#define BLOCKSIZE 32768
|
||||||
|
|
||||||
struct sha1_ctx ctx_block;
|
struct sha1_ctx ctx_block;
|
||||||
struct sha1_ctx ctx_payload;
|
struct sha1_ctx ctx_payload;
|
||||||
long int pos;
|
off_t pos;
|
||||||
size_t sum;
|
off_t sum;
|
||||||
|
|
||||||
char *buffer = malloc (BLOCKSIZE + 72);
|
char *buffer = malloc (BLOCKSIZE + 72);
|
||||||
if (!buffer)
|
if (!buffer)
|
||||||
@ -434,7 +434,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
|
|||||||
/* We read the file in blocks of BLOCKSIZE bytes. One call of the
|
/* We read the file in blocks of BLOCKSIZE bytes. One call of the
|
||||||
computation function processes the whole buffer so that with the
|
computation function processes the whole buffer so that with the
|
||||||
next round of the loop another block can be read. */
|
next round of the loop another block can be read. */
|
||||||
size_t n;
|
off_t n;
|
||||||
sum = 0;
|
sum = 0;
|
||||||
|
|
||||||
/* Read block. Take care for partial reads. */
|
/* Read block. Take care for partial reads. */
|
||||||
@ -475,7 +475,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
|
|||||||
if (payload_offset >= 0 && payload_offset < pos)
|
if (payload_offset >= 0 && payload_offset < pos)
|
||||||
{
|
{
|
||||||
/* At least part of the buffer contains data from payload. */
|
/* At least part of the buffer contains data from payload. */
|
||||||
int start_of_payload = payload_offset - (pos - BLOCKSIZE);
|
off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
|
||||||
if (start_of_payload <= 0)
|
if (start_of_payload <= 0)
|
||||||
/* All bytes in the buffer belong to the payload. */
|
/* All bytes in the buffer belong to the payload. */
|
||||||
start_of_payload = 0;
|
start_of_payload = 0;
|
||||||
@ -499,7 +499,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
|
|||||||
if (payload_offset >= 0 && payload_offset < pos)
|
if (payload_offset >= 0 && payload_offset < pos)
|
||||||
{
|
{
|
||||||
/* At least part of the buffer contains data from payload. */
|
/* At least part of the buffer contains data from payload. */
|
||||||
int start_of_payload = payload_offset - (pos - sum);
|
off_t start_of_payload = payload_offset - (pos - sum);
|
||||||
if (start_of_payload <= 0)
|
if (start_of_payload <= 0)
|
||||||
/* All bytes in the buffer belong to the payload. */
|
/* All bytes in the buffer belong to the payload. */
|
||||||
start_of_payload = 0;
|
start_of_payload = 0;
|
||||||
@ -1134,7 +1134,7 @@ warc_tempfile ()
|
|||||||
Calling this function will close body.
|
Calling this function will close body.
|
||||||
Returns true on success, false on error. */
|
Returns true on success, false on error. */
|
||||||
bool
|
bool
|
||||||
warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset)
|
warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
|
||||||
{
|
{
|
||||||
warc_write_start_record ();
|
warc_write_start_record ();
|
||||||
warc_write_header ("WARC-Type", "request");
|
warc_write_header ("WARC-Type", "request");
|
||||||
@ -1166,7 +1166,7 @@ warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip
|
|||||||
response_uuid is the uuid of the response.
|
response_uuid is the uuid of the response.
|
||||||
Returns true on success, false on error. */
|
Returns true on success, false on error. */
|
||||||
static bool
|
static bool
|
||||||
warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid)
|
warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
|
||||||
{
|
{
|
||||||
/* Transform the timestamp. */
|
/* Transform the timestamp. */
|
||||||
char timestamp_str_cdx [15];
|
char timestamp_str_cdx [15];
|
||||||
@ -1258,7 +1258,7 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_u
|
|||||||
Calling this function will close body.
|
Calling this function will close body.
|
||||||
Returns true on success, false on error. */
|
Returns true on success, false on error. */
|
||||||
bool
|
bool
|
||||||
warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location)
|
warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
|
||||||
{
|
{
|
||||||
char *block_digest = NULL;
|
char *block_digest = NULL;
|
||||||
char *payload_digest = NULL;
|
char *payload_digest = NULL;
|
||||||
@ -1304,8 +1304,8 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
|
|||||||
char response_uuid [48];
|
char response_uuid [48];
|
||||||
warc_uuid_str (response_uuid);
|
warc_uuid_str (response_uuid);
|
||||||
|
|
||||||
fseek (warc_current_file, 0L, SEEK_END);
|
fseeko (warc_current_file, 0L, SEEK_END);
|
||||||
size_t offset = ftell (warc_current_file);
|
off_t offset = ftello (warc_current_file);
|
||||||
|
|
||||||
warc_write_start_record ();
|
warc_write_start_record ();
|
||||||
warc_write_header ("WARC-Type", "response");
|
warc_write_header ("WARC-Type", "response");
|
||||||
@ -1349,7 +1349,7 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
|
|||||||
Calling this function will close body.
|
Calling this function will close body.
|
||||||
Returns true on success, false on error. */
|
Returns true on success, false on error. */
|
||||||
bool
|
bool
|
||||||
warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset)
|
warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
|
||||||
{
|
{
|
||||||
if (resource_uuid == NULL)
|
if (resource_uuid == NULL)
|
||||||
{
|
{
|
||||||
|
@ -11,9 +11,9 @@ void warc_uuid_str (char *id_str);
|
|||||||
|
|
||||||
FILE * warc_tempfile ();
|
FILE * warc_tempfile ();
|
||||||
|
|
||||||
bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset);
|
bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset);
|
||||||
bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location);
|
bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location);
|
||||||
bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset);
|
bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset);
|
||||||
|
|
||||||
#endif /* WARC_H */
|
#endif /* WARC_H */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user