1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

warc: support large files.

This commit is contained in:
Gijs van Tulder 2012-02-25 11:58:21 +01:00 committed by Giuseppe Scrivano
parent 408126aae0
commit 6a25955fe6
4 changed files with 39 additions and 33 deletions

View File

@ -1,3 +1,9 @@
2012-02-01 Gijs van Tulder <gvtulder@gmail.com>
* warc.c: Fix large file support with ftello, fseeko.
* warc.h: Fix large file support.
* http.c: Fix large file support.
2012-02-23 Giuseppe Scrivano <giuseppe@southpole.se> 2012-02-23 Giuseppe Scrivano <giuseppe@southpole.se>
* main.c (main): Write diagnostic messages to `stderr' not to `stdout'. * main.c (main): Write diagnostic messages to `stderr' not to `stdout'.

View File

@ -1712,7 +1712,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
char warc_timestamp_str [21]; char warc_timestamp_str [21];
char warc_request_uuid [48]; char warc_request_uuid [48];
ip_address *warc_ip = NULL; ip_address *warc_ip = NULL;
long int warc_payload_offset = -1; off_t warc_payload_offset = -1;
/* Whether this connection will be kept alive after the HTTP request /* Whether this connection will be kept alive after the HTTP request
is done. */ is done. */
@ -2127,7 +2127,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
if (write_error >= 0 && warc_tmp != NULL) if (write_error >= 0 && warc_tmp != NULL)
{ {
/* Remember end of headers / start of payload. */ /* Remember end of headers / start of payload. */
warc_payload_offset = ftell (warc_tmp); warc_payload_offset = ftello (warc_tmp);
/* Write a copy of the data to the WARC record. */ /* Write a copy of the data to the WARC record. */
int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp); int warc_tmp_written = fwrite (opt.post_data, 1, post_data_size, warc_tmp);
@ -2139,7 +2139,7 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
{ {
if (warc_tmp != NULL) if (warc_tmp != NULL)
/* Remember end of headers / start of payload. */ /* Remember end of headers / start of payload. */
warc_payload_offset = ftell (warc_tmp); warc_payload_offset = ftello (warc_tmp);
write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp); write_error = post_file (sock, opt.post_file_name, post_data_size, warc_tmp);
} }

View File

@ -78,10 +78,10 @@ static FILE *warc_current_file;
static gzFile *warc_current_gzfile; static gzFile *warc_current_gzfile;
/* The offset of the current gzip record in the WARC file. */ /* The offset of the current gzip record in the WARC file. */
static size_t warc_current_gzfile_offset; static off_t warc_current_gzfile_offset;
/* The uncompressed size (so far) of the current record. */ /* The uncompressed size (so far) of the current record. */
static size_t warc_current_gzfile_uncompressed_size; static off_t warc_current_gzfile_uncompressed_size;
# endif # endif
/* This is true until a warc_write_* method fails. */ /* This is true until a warc_write_* method fails. */
@ -186,7 +186,7 @@ warc_write_start_record ()
return false; return false;
fflush (warc_current_file); fflush (warc_current_file);
if (opt.warc_maxsize > 0 && ftell (warc_current_file) >= opt.warc_maxsize) if (opt.warc_maxsize > 0 && ftello (warc_current_file) >= opt.warc_maxsize)
warc_start_new_file (false); warc_start_new_file (false);
#ifdef HAVE_LIBZ #ifdef HAVE_LIBZ
@ -194,7 +194,7 @@ warc_write_start_record ()
if (opt.warc_compression_enabled) if (opt.warc_compression_enabled)
{ {
/* Record the starting offset of the new record. */ /* Record the starting offset of the new record. */
warc_current_gzfile_offset = ftell (warc_current_file); warc_current_gzfile_offset = ftello (warc_current_file);
/* Reserve space for the extra GZIP header field. /* Reserve space for the extra GZIP header field.
In warc_write_end_record we will fill this space In warc_write_end_record we will fill this space
@ -245,8 +245,8 @@ warc_write_block_from_file (FILE *data_in)
{ {
/* Add the Content-Length header. */ /* Add the Content-Length header. */
char *content_length; char *content_length;
fseek (data_in, 0L, SEEK_END); fseeko (data_in, 0L, SEEK_END);
if (! asprintf (&content_length, "%ld", ftell (data_in))) if (! asprintf (&content_length, "%ld", ftello (data_in)))
{ {
warc_write_ok = false; warc_write_ok = false;
return false; return false;
@ -257,7 +257,7 @@ warc_write_block_from_file (FILE *data_in)
/* End of the WARC header section. */ /* End of the WARC header section. */
warc_write_string ("\r\n"); warc_write_string ("\r\n");
if (fseek (data_in, 0L, SEEK_SET) != 0) if (fseeko (data_in, 0L, SEEK_SET) != 0)
warc_write_ok = false; warc_write_ok = false;
/* Copy the data in the file to the WARC record. */ /* Copy the data in the file to the WARC record. */
@ -294,7 +294,7 @@ warc_write_end_record ()
} }
fflush (warc_current_file); fflush (warc_current_file);
fseek (warc_current_file, 0, SEEK_END); fseeko (warc_current_file, 0, SEEK_END);
/* The WARC standard suggests that we add 'skip length' data in the /* The WARC standard suggests that we add 'skip length' data in the
extra header field of the GZIP stream. extra header field of the GZIP stream.
@ -312,12 +312,12 @@ warc_write_end_record ()
*/ */
/* Calculate the uncompressed and compressed sizes. */ /* Calculate the uncompressed and compressed sizes. */
size_t current_offset = ftell (warc_current_file); off_t current_offset = ftello (warc_current_file);
size_t uncompressed_size = current_offset - warc_current_gzfile_offset; off_t uncompressed_size = current_offset - warc_current_gzfile_offset;
size_t compressed_size = warc_current_gzfile_uncompressed_size; off_t compressed_size = warc_current_gzfile_uncompressed_size;
/* Go back to the static GZIP header. */ /* Go back to the static GZIP header. */
fseek (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET); fseeko (warc_current_file, warc_current_gzfile_offset + EXTRA_GZIP_HEADER_SIZE, SEEK_SET);
/* Read the header. */ /* Read the header. */
char static_header[GZIP_STATIC_HEADER_SIZE]; char static_header[GZIP_STATIC_HEADER_SIZE];
@ -332,7 +332,7 @@ warc_write_end_record ()
static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA; static_header[OFF_FLG] = static_header[OFF_FLG] | FLG_FEXTRA;
/* Write the header back to the file, but starting at warc_current_gzfile_offset. */ /* Write the header back to the file, but starting at warc_current_gzfile_offset. */
fseek (warc_current_file, warc_current_gzfile_offset, SEEK_SET); fseeko (warc_current_file, warc_current_gzfile_offset, SEEK_SET);
fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file); fwrite (static_header, 1, GZIP_STATIC_HEADER_SIZE, warc_current_file);
/* Prepare the extra GZIP header. */ /* Prepare the extra GZIP header. */
@ -355,12 +355,12 @@ warc_write_end_record ()
extra_header[11] = (compressed_size >> 24) & 255; extra_header[11] = (compressed_size >> 24) & 255;
/* Write the extra header after the static header. */ /* Write the extra header after the static header. */
fseek (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET); fseeko (warc_current_file, warc_current_gzfile_offset + GZIP_STATIC_HEADER_SIZE, SEEK_SET);
fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file); fwrite (extra_header, 1, EXTRA_GZIP_HEADER_SIZE, warc_current_file);
/* Done, move back to the end of the file. */ /* Done, move back to the end of the file. */
fflush (warc_current_file); fflush (warc_current_file);
fseek (warc_current_file, 0, SEEK_END); fseeko (warc_current_file, 0, SEEK_END);
} }
#endif /* HAVE_LIBZ */ #endif /* HAVE_LIBZ */
@ -408,14 +408,14 @@ warc_write_ip_header (ip_address *ip)
the end of the file. The digest number will be written into the the end of the file. The digest number will be written into the
16 bytes beginning at RES_PAYLOAD. */ 16 bytes beginning at RES_PAYLOAD. */
static int static int
warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, long int payload_offset) warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload, off_t payload_offset)
{ {
#define BLOCKSIZE 32768 #define BLOCKSIZE 32768
struct sha1_ctx ctx_block; struct sha1_ctx ctx_block;
struct sha1_ctx ctx_payload; struct sha1_ctx ctx_payload;
long int pos; off_t pos;
size_t sum; off_t sum;
char *buffer = malloc (BLOCKSIZE + 72); char *buffer = malloc (BLOCKSIZE + 72);
if (!buffer) if (!buffer)
@ -434,7 +434,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
/* We read the file in blocks of BLOCKSIZE bytes. One call of the /* We read the file in blocks of BLOCKSIZE bytes. One call of the
computation function processes the whole buffer so that with the computation function processes the whole buffer so that with the
next round of the loop another block can be read. */ next round of the loop another block can be read. */
size_t n; off_t n;
sum = 0; sum = 0;
/* Read block. Take care for partial reads. */ /* Read block. Take care for partial reads. */
@ -475,7 +475,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
if (payload_offset >= 0 && payload_offset < pos) if (payload_offset >= 0 && payload_offset < pos)
{ {
/* At least part of the buffer contains data from payload. */ /* At least part of the buffer contains data from payload. */
int start_of_payload = payload_offset - (pos - BLOCKSIZE); off_t start_of_payload = payload_offset - (pos - BLOCKSIZE);
if (start_of_payload <= 0) if (start_of_payload <= 0)
/* All bytes in the buffer belong to the payload. */ /* All bytes in the buffer belong to the payload. */
start_of_payload = 0; start_of_payload = 0;
@ -499,7 +499,7 @@ warc_sha1_stream_with_payload (FILE *stream, void *res_block, void *res_payload,
if (payload_offset >= 0 && payload_offset < pos) if (payload_offset >= 0 && payload_offset < pos)
{ {
/* At least part of the buffer contains data from payload. */ /* At least part of the buffer contains data from payload. */
int start_of_payload = payload_offset - (pos - sum); off_t start_of_payload = payload_offset - (pos - sum);
if (start_of_payload <= 0) if (start_of_payload <= 0)
/* All bytes in the buffer belong to the payload. */ /* All bytes in the buffer belong to the payload. */
start_of_payload = 0; start_of_payload = 0;
@ -1134,7 +1134,7 @@ warc_tempfile ()
Calling this function will close body. Calling this function will close body.
Returns true on success, false on error. */ Returns true on success, false on error. */
bool bool
warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, long int payload_offset) warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip_address *ip, FILE *body, off_t payload_offset)
{ {
warc_write_start_record (); warc_write_start_record ();
warc_write_header ("WARC-Type", "request"); warc_write_header ("WARC-Type", "request");
@ -1166,7 +1166,7 @@ warc_write_request_record (char *url, char *timestamp_str, char *record_uuid, ip
response_uuid is the uuid of the response. response_uuid is the uuid of the response.
Returns true on success, false on error. */ Returns true on success, false on error. */
static bool static bool
warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, size_t offset, char *warc_filename, char *response_uuid) warc_write_cdx_record (char *url, char *timestamp_str, char *mime_type, int response_code, char *payload_digest, char *redirect_location, off_t offset, char *warc_filename, char *response_uuid)
{ {
/* Transform the timestamp. */ /* Transform the timestamp. */
char timestamp_str_cdx [15]; char timestamp_str_cdx [15];
@ -1258,7 +1258,7 @@ warc_write_revisit_record (char *url, char *timestamp_str, char *concurrent_to_u
Calling this function will close body. Calling this function will close body.
Returns true on success, false on error. */ Returns true on success, false on error. */
bool bool
warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location) warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location)
{ {
char *block_digest = NULL; char *block_digest = NULL;
char *payload_digest = NULL; char *payload_digest = NULL;
@ -1304,8 +1304,8 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
char response_uuid [48]; char response_uuid [48];
warc_uuid_str (response_uuid); warc_uuid_str (response_uuid);
fseek (warc_current_file, 0L, SEEK_END); fseeko (warc_current_file, 0L, SEEK_END);
size_t offset = ftell (warc_current_file); off_t offset = ftello (warc_current_file);
warc_write_start_record (); warc_write_start_record ();
warc_write_header ("WARC-Type", "response"); warc_write_header ("WARC-Type", "response");
@ -1349,7 +1349,7 @@ warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_
Calling this function will close body. Calling this function will close body.
Returns true on success, false on error. */ Returns true on success, false on error. */
bool bool
warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset) warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset)
{ {
if (resource_uuid == NULL) if (resource_uuid == NULL)
{ {

View File

@ -11,9 +11,9 @@ void warc_uuid_str (char *id_str);
FILE * warc_tempfile (); FILE * warc_tempfile ();
bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset); bool warc_write_request_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset);
bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, long int payload_offset, char *mime_type, int response_code, char *redirect_location); bool warc_write_response_record (char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, FILE *body, off_t payload_offset, char *mime_type, int response_code, char *redirect_location);
bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, long int payload_offset); bool warc_write_resource_record (char *resource_uuid, char *url, char *timestamp_str, char *concurrent_to_uuid, ip_address *ip, char *content_type, FILE *body, off_t payload_offset);
#endif /* WARC_H */ #endif /* WARC_H */