1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Squash embedded newlines in links.

This commit is contained in:
hniksic 2003-11-26 08:37:04 -08:00
parent 1b2dce0493
commit 3f84a5e00e
3 changed files with 21 additions and 12 deletions

View File

@ -1,3 +1,8 @@
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
* html-parse.c (convert_and_copy): Remove embedded newlines when
AP_TRIM_BLANKS is specified.
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
* ftp.c: Set con->csock to -1 where rbuf_uninitialize was

View File

@ -360,17 +360,16 @@ enum {
the ASCII range when copying the string.
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
of text. */
of text, as well as embedded newlines. */
static void
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
{
int old_tail = pool->tail;
int size;
/* First, skip blanks if required. We must do this before entities
are processed, so that blanks can still be inserted as, for
instance, `&#32;'. */
/* Skip blanks if required. We must do this before entities are
processed, so that blanks can still be inserted as, for instance,
`&#32;'. */
if (flags & AP_TRIM_BLANKS)
{
while (beg < end && ISSPACE (*beg))
@ -378,7 +377,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
while (end > beg && ISSPACE (end[-1]))
--end;
}
size = end - beg;
if (flags & AP_DECODE_ENTITIES)
{
@ -391,15 +389,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
never lengthen it. */
const char *from = beg;
char *to;
int squash_newlines = flags & AP_TRIM_BLANKS;
POOL_GROW (pool, end - beg);
to = pool->contents + pool->tail;
while (from < end)
{
if (*from != '&')
*to++ = *from++;
else
if (*from == '&')
{
int entity = decode_entity (&from, end);
if (entity != -1)
@ -407,6 +404,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
else
*to++ = *from++;
}
else if ((*from == '\n' || *from == '\r') && squash_newlines)
++from;
else
*to++ = *from++;
}
/* Verify that we haven't exceeded the original size. (It
shouldn't happen, hence the assert.) */

View File

@ -612,9 +612,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
init_interesting ();
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
generate <a href=" foo"> instead of <a href="foo"> (Netscape
ignores spaces as well.) If you really mean space, use &#32; or
%20. */
generate <a href=" foo"> instead of <a href="foo"> (browsers
ignore spaces as well.) If you really mean space, use &#32; or
%20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
e.g. in <img src="foo.[newline]html">. Such newlines are also
ignored by IE and Mozilla and are presumably introduced by
writing HTML with editors that force word wrap. */
flags = MHT_TRIM_VALUES;
if (opt.strict_comments)
flags |= MHT_STRICT_COMMENTS;