diff --git a/src/ChangeLog b/src/ChangeLog index 24ae86fa..c6a4f660 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +2003-11-26 Hrvoje Niksic + + * html-parse.c (convert_and_copy): Remove embedded newlines when + AP_TRIM_BLANKS is specified. + 2003-11-26 Hrvoje Niksic * ftp.c: Set con->csock to -1 where rbuf_uninitialize was diff --git a/src/html-parse.c b/src/html-parse.c index 2a09ff09..4a86627e 100644 --- a/src/html-parse.c +++ b/src/html-parse.c @@ -360,17 +360,16 @@ enum { the ASCII range when copying the string. * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end - of text. */ + of text, as well as embedded newlines. */ static void convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags) { int old_tail = pool->tail; - int size; - /* First, skip blanks if required. We must do this before entities - are processed, so that blanks can still be inserted as, for - instance, `&#32;'. */ + /* Skip blanks if required. We must do this before entities are + processed, so that blanks can still be inserted as, for instance, + `&#32;'. */ if (flags & AP_TRIM_BLANKS) { while (beg < end && ISSPACE (*beg)) @@ -378,7 +377,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags while (end > beg && ISSPACE (end[-1])) --end; } - size = end - beg; if (flags & AP_DECODE_ENTITIES) { @@ -391,15 +389,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags never lengthen it. 
*/ const char *from = beg; char *to; + int squash_newlines = flags & AP_TRIM_BLANKS; POOL_GROW (pool, end - beg); to = pool->contents + pool->tail; while (from < end) { - if (*from != '&') - *to++ = *from++; - else + if (*from == '&') { int entity = decode_entity (&from, end); if (entity != -1) @@ -407,6 +404,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags else *to++ = *from++; } + else if ((*from == '\n' || *from == '\r') && squash_newlines) + ++from; + else + *to++ = *from++; } /* Verify that we haven't exceeded the original size. (It shouldn't happen, hence the assert.) */ diff --git a/src/html-url.c b/src/html-url.c index 89b93539..59d873b3 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -612,9 +612,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow) init_interesting (); /* Specify MHT_TRIM_VALUES because of buggy HTML generators that - generate <a href=" foo"> instead of <a href="foo"> (Netscape - ignores spaces as well.) If you really mean space, use &32; or - %20. */ + generate <a href=" foo"> instead of <a href="foo"> (browsers + ignore spaces as well.) If you really mean space, use &32; or + %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, + e.g. in <img src="foo.[newline]html">. Such newlines are also + ignored by IE and Mozilla and are presumably introduced by + writing HTML with editors that force word wrap. */ flags = MHT_TRIM_VALUES; if (opt.strict_comments) flags |= MHT_STRICT_COMMENTS;