mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Squash embedded newlines in links.
This commit is contained in:
parent
1b2dce0493
commit
3f84a5e00e
@ -1,3 +1,8 @@
|
||||
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* html-parse.c (convert_and_copy): Remove embedded newlines when
|
||||
AP_TRIM_BLANKS is specified.
|
||||
|
||||
2003-11-26 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* ftp.c: Set con->csock to -1 where rbuf_uninitialize was
|
||||
|
@ -360,17 +360,16 @@ enum {
|
||||
the ASCII range when copying the string.
|
||||
|
||||
* AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
|
||||
of text. */
|
||||
of text, as well as embedded newlines. */
|
||||
|
||||
static void
|
||||
convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
|
||||
{
|
||||
int old_tail = pool->tail;
|
||||
int size;
|
||||
|
||||
/* First, skip blanks if required. We must do this before entities
|
||||
are processed, so that blanks can still be inserted as, for
|
||||
instance, `&#32;'. */
|
||||
/* Skip blanks if required. We must do this before entities are
|
||||
processed, so that blanks can still be inserted as, for instance,
|
||||
`&#32;'. */
|
||||
if (flags & AP_TRIM_BLANKS)
|
||||
{
|
||||
while (beg < end && ISSPACE (*beg))
|
||||
@ -378,7 +377,6 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
||||
while (end > beg && ISSPACE (end[-1]))
|
||||
--end;
|
||||
}
|
||||
size = end - beg;
|
||||
|
||||
if (flags & AP_DECODE_ENTITIES)
|
||||
{
|
||||
@ -391,15 +389,14 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
||||
never lengthen it. */
|
||||
const char *from = beg;
|
||||
char *to;
|
||||
int squash_newlines = flags & AP_TRIM_BLANKS;
|
||||
|
||||
POOL_GROW (pool, end - beg);
|
||||
to = pool->contents + pool->tail;
|
||||
|
||||
while (from < end)
|
||||
{
|
||||
if (*from != '&')
|
||||
*to++ = *from++;
|
||||
else
|
||||
if (*from == '&')
|
||||
{
|
||||
int entity = decode_entity (&from, end);
|
||||
if (entity != -1)
|
||||
@ -407,6 +404,10 @@ convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags
|
||||
else
|
||||
*to++ = *from++;
|
||||
}
|
||||
else if ((*from == '\n' || *from == '\r') && squash_newlines)
|
||||
++from;
|
||||
else
|
||||
*to++ = *from++;
|
||||
}
|
||||
/* Verify that we haven't exceeded the original size. (It
|
||||
shouldn't happen, hence the assert.) */
|
||||
|
@ -612,9 +612,12 @@ get_urls_html (const char *file, const char *url, int *meta_disallow_follow)
|
||||
init_interesting ();
|
||||
|
||||
/* Specify MHT_TRIM_VALUES because of buggy HTML generators that
|
||||
generate <a href=" foo"> instead of <a href="foo"> (Netscape
|
||||
ignores spaces as well.) If you really mean space, use &#32; or
|
||||
%20. */
|
||||
generate <a href=" foo"> instead of <a href="foo"> (browsers
|
||||
ignore spaces as well.) If you really mean space, use &#32; or
|
||||
%20. MHT_TRIM_VALUES also causes squashing of embedded newlines,
|
||||
e.g. in <img src="foo.[newline]html">. Such newlines are also
|
||||
ignored by IE and Mozilla and are presumably introduced by
|
||||
writing HTML with editors that force word wrap. */
|
||||
flags = MHT_TRIM_VALUES;
|
||||
if (opt.strict_comments)
|
||||
flags |= MHT_STRICT_COMMENTS;
|
||||
|
Loading…
Reference in New Issue
Block a user