diff --git a/src/ChangeLog b/src/ChangeLog index a0fabf77..6777851d 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,11 @@ +2005-07-02 Hrvoje Niksic + + * http.c (response_head_terminator): Minor optimization. + + * retr.c (fd_read_hunk): Call terminator with pointer to the start + of the data and the pointer to the current data. Changed all + callers. + 2005-07-01 Hrvoje Niksic * url.c (url_parse): Make sure u->params is not initialized for diff --git a/src/http.c b/src/http.c index 4d1c16eb..e281cd52 100644 --- a/src/http.c +++ b/src/http.c @@ -416,40 +416,51 @@ post_file (int sock, const char *file_name, wgint promised_size) return 0; } +/* Determine whether [START, PEEKED + PEEKLEN) contains an empty line. + If so, return the pointer to the position after the line, otherwise + return NULL. This is used as callback to fd_read_hunk. The data + between START and PEEKED has been read and cannot be "unread"; the + data after PEEKED has only been peeked. */ + static const char * -response_head_terminator (const char *hunk, int oldlen, int peeklen) +response_head_terminator (const char *start, const char *peeked, int peeklen) { - const char *start, *end; + const char *p, *end; /* If at first peek, verify whether HUNK starts with "HTTP". If not, this is a HTTP/0.9 request and we must bail out without reading anything. */ - if (oldlen == 0 && 0 != memcmp (hunk, "HTTP", MIN (peeklen, 4))) - return hunk; + if (start == peeked && 0 != memcmp (start, "HTTP", MIN (peeklen, 4))) + return start; - if (oldlen < 4) - start = hunk; - else - start = hunk + oldlen - 4; - end = hunk + oldlen + peeklen; + /* Look for "\n[\r]\n", and return the following position if found. + Start two chars before the current to cover the possibility that + part of the terminator (e.g. "\n\r") arrived in the previous + batch. */ + p = peeked - start < 2 ? 
start : peeked - 2; + end = peeked + peeklen; - for (; start < end - 1; start++) - if (*start == '\n') + /* Check for \n\r\n or \n\n anywhere in [p, end-2). */ + for (; p < end - 2; p++) + if (*p == '\n') { - if (start < end - 2 - && start[1] == '\r' - && start[2] == '\n') - return start + 3; - if (start[1] == '\n') - return start + 2; + if (p[1] == '\r' && p[2] == '\n') + return p + 3; + else if (p[1] == '\n') + return p + 2; } + /* p==end-2: check for \n\n directly preceding END. */ + if (p[0] == '\n' && p[1] == '\n') + return p + 2; + return NULL; } -/* The maximum size of a single HTTP response we care to read. This - is not meant to impose an arbitrary limit, but to protect the user - from Wget slurping up available memory upon encountering malicious - or buggy server output. Define it to 0 to remove the limit. */ +/* The maximum size of a single HTTP response we care to read. Rather + than being a limit of the reader implementation, this limit + prevents Wget from slurping all available memory upon encountering + malicious or buggy server output, thus protecting the user. Define + it to 0 to remove the limit. */ #define HTTP_RESPONSE_MAX_SIZE 65536 diff --git a/src/retr.c b/src/retr.c index 45061936..918fb5de 100644 --- a/src/retr.c +++ b/src/retr.c @@ -336,22 +336,35 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, return ret; } -/* Read a hunk of data from FD, up until a terminator. The terminator - is whatever the TERMINATOR function determines it to be; for - example, it can be a line of data, or the head of an HTTP response. - The function returns the data read allocated with malloc. +/* Read a hunk of data from FD, up until a terminator. The hunk is + limited by whatever the TERMINATOR callback chooses as its + terminator. For example, if terminator stops at newline, the hunk + will consist of a line of data; if terminator stops at two + newlines, it can be used to read the head of an HTTP response. 
+ Upon determining the boundary, the function returns the data (up to the terminator) in malloc-allocated storage. - In case of error, NULL is returned. In case of EOF and no data - read, NULL is returned and errno set to 0. In case of EOF with - data having been read, the data is returned, but it will - (obviously) not contain the terminator. + In case of read error, NULL is returned. In case of EOF and no + data read, NULL is returned and errno set to 0. In case of having + read some data, but encountering EOF before seeing the terminator, + the data that has been read is returned, but it will (obviously) + not contain the terminator. + + The TERMINATOR function is called with three arguments: the + beginning of the data read so far, the beginning of the current + block of peeked-at data, and the length of the current block. + Depending on its needs, the function is free to choose whether to + analyze all data or just the newly arrived data. If TERMINATOR + returns NULL, it means that the terminator has not been seen. + Otherwise it should return a pointer to the character immediately + following the terminator. The idea is to be able to read a line of input, or otherwise a hunk of text, such as the head of an HTTP request, without crossing the boundary, so that the next call to fd_read etc. reads the data after the hunk. To achieve that, this function does the following: - 1. Peek at available data. + 1. Peek at incoming data. 2. Determine whether the peeked data, along with the previously read data, includes the terminator. @@ -396,12 +409,13 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize) xfree (hunk); return NULL; } - end = terminator (hunk, tail, pklen); + end = terminator (hunk, hunk + tail, pklen); if (end) { /* The data contains the terminator: we'll drain the data up to the end of the terminator. */ remain = end - (hunk + tail); + assert (remain >= 0); if (remain == 0) { /* No more data needs to be read. 
*/ @@ -471,11 +485,11 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize) } static const char * -line_terminator (const char *hunk, int oldlen, int peeklen) +line_terminator (const char *start, const char *peeked, int peeklen) { - const char *p = memchr (hunk + oldlen, '\n', peeklen); + const char *p = memchr (peeked, '\n', peeklen); if (p) - /* p+1 because we want the line to include '\n' */ + /* p+1 because the line must include '\n' */ return p + 1; return NULL; } diff --git a/src/retr.h b/src/retr.h index 305ced4f..441471ac 100644 --- a/src/retr.h +++ b/src/retr.h @@ -45,7 +45,7 @@ enum { int fd_read_body (int, FILE *, wgint, wgint, wgint *, wgint *, double *, int); -typedef const char *(*hunk_terminator_t) (const char *, int, int); +typedef const char *(*hunk_terminator_t) (const char *, const char *, int); char *fd_read_hunk (int, hunk_terminator_t, long, long); char *fd_read_line (int);