mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Parse <img srcset> attributes, they have image URLs.
* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep inside attributes without adding extraneous quoting * src/convert.c (convert_links): Honor link_noquote_html_p * src/html_url.c (tag_handle_img): New function. Add srcset parsing.
This commit is contained in:
parent
7099f48998
commit
c28f51aadf
@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
char *quoted_newname = local_quote_string (newname,
|
char *quoted_newname = local_quote_string (newname,
|
||||||
link->link_css_p);
|
link->link_css_p);
|
||||||
|
|
||||||
if (link->link_css_p)
|
if (link->link_css_p || link->link_noquote_html_p)
|
||||||
p = replace_plain (p, link->size, fp, quoted_newname);
|
p = replace_plain (p, link->size, fp, quoted_newname);
|
||||||
else if (!link->link_refresh_p)
|
else if (!link->link_refresh_p)
|
||||||
p = replace_attr (p, link->size, fp, quoted_newname);
|
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||||
@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
char *newname = convert_basename (p, link);
|
char *newname = convert_basename (p, link);
|
||||||
char *quoted_newname = local_quote_string (newname, link->link_css_p);
|
char *quoted_newname = local_quote_string (newname, link->link_css_p);
|
||||||
|
|
||||||
if (link->link_css_p)
|
if (link->link_css_p || link->link_noquote_html_p)
|
||||||
p = replace_plain (p, link->size, fp, quoted_newname);
|
p = replace_plain (p, link->size, fp, quoted_newname);
|
||||||
else if (!link->link_refresh_p)
|
else if (!link->link_refresh_p)
|
||||||
p = replace_attr (p, link->size, fp, quoted_newname);
|
p = replace_attr (p, link->size, fp, quoted_newname);
|
||||||
@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
|
|||||||
char *newlink = link->url->url;
|
char *newlink = link->url->url;
|
||||||
char *quoted_newlink = html_quote_string (newlink);
|
char *quoted_newlink = html_quote_string (newlink);
|
||||||
|
|
||||||
if (link->link_css_p)
|
if (link->link_css_p || link->link_noquote_html_p)
|
||||||
p = replace_plain (p, link->size, fp, newlink);
|
p = replace_plain (p, link->size, fp, newlink);
|
||||||
else if (!link->link_refresh_p)
|
else if (!link->link_refresh_p)
|
||||||
p = replace_attr (p, link->size, fp, quoted_newlink);
|
p = replace_attr (p, link->size, fp, quoted_newlink);
|
||||||
|
@ -69,6 +69,7 @@ struct urlpos {
|
|||||||
unsigned int link_base_p :1; /* the url came from <base href=...> */
|
unsigned int link_base_p :1; /* the url came from <base href=...> */
|
||||||
unsigned int link_inline_p :1; /* needed to render the page */
|
unsigned int link_inline_p :1; /* needed to render the page */
|
||||||
unsigned int link_css_p :1; /* the url came from CSS */
|
unsigned int link_css_p :1; /* the url came from CSS */
|
||||||
|
unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
|
||||||
unsigned int link_expect_html :1; /* expected to contain HTML */
|
unsigned int link_expect_html :1; /* expected to contain HTML */
|
||||||
unsigned int link_expect_css :1; /* expected to contain CSS */
|
unsigned int link_expect_css :1; /* expected to contain CSS */
|
||||||
|
|
||||||
|
@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
|
|||||||
DECLARE_TAG_HANDLER (tag_find_urls);
|
DECLARE_TAG_HANDLER (tag_find_urls);
|
||||||
DECLARE_TAG_HANDLER (tag_handle_base);
|
DECLARE_TAG_HANDLER (tag_handle_base);
|
||||||
DECLARE_TAG_HANDLER (tag_handle_form);
|
DECLARE_TAG_HANDLER (tag_handle_form);
|
||||||
|
DECLARE_TAG_HANDLER (tag_handle_img);
|
||||||
DECLARE_TAG_HANDLER (tag_handle_link);
|
DECLARE_TAG_HANDLER (tag_handle_link);
|
||||||
DECLARE_TAG_HANDLER (tag_handle_meta);
|
DECLARE_TAG_HANDLER (tag_handle_meta);
|
||||||
|
|
||||||
@ -105,7 +106,7 @@ static struct known_tag {
|
|||||||
{ TAG_FORM, "form", tag_handle_form },
|
{ TAG_FORM, "form", tag_handle_form },
|
||||||
{ TAG_FRAME, "frame", tag_find_urls },
|
{ TAG_FRAME, "frame", tag_find_urls },
|
||||||
{ TAG_IFRAME, "iframe", tag_find_urls },
|
{ TAG_IFRAME, "iframe", tag_find_urls },
|
||||||
{ TAG_IMG, "img", tag_find_urls },
|
{ TAG_IMG, "img", tag_handle_img },
|
||||||
{ TAG_INPUT, "input", tag_find_urls },
|
{ TAG_INPUT, "input", tag_find_urls },
|
||||||
{ TAG_LAYER, "layer", tag_find_urls },
|
{ TAG_LAYER, "layer", tag_find_urls },
|
||||||
{ TAG_LINK, "link", tag_handle_link },
|
{ TAG_LINK, "link", tag_handle_link },
|
||||||
@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
|
|||||||
"name", /* used by tag_handle_meta */
|
"name", /* used by tag_handle_meta */
|
||||||
"content", /* used by tag_handle_meta */
|
"content", /* used by tag_handle_meta */
|
||||||
"action", /* used by tag_handle_form */
|
"action", /* used by tag_handle_form */
|
||||||
"style" /* used by check_style_attr */
|
"style", /* used by check_style_attr */
|
||||||
|
"srcset", /* used by tag_handle_img */
|
||||||
};
|
};
|
||||||
|
|
||||||
static struct hash_table *interesting_tags;
|
static struct hash_table *interesting_tags;
|
||||||
@ -674,6 +676,88 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Handle the IMG tag. This requires special handling for the srcset attr,
|
||||||
|
while the traditional src/lowsrc/href attributes can be handled generically.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void
|
||||||
|
tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
|
||||||
|
int attrind;
|
||||||
|
char *srcset;
|
||||||
|
|
||||||
|
/* Use the generic approach for the attributes without special syntax. */
|
||||||
|
tag_find_urls(tagid, tag, ctx);
|
||||||
|
|
||||||
|
srcset = find_attr (tag, "srcset", &attrind);
|
||||||
|
if (srcset)
|
||||||
|
{
|
||||||
|
/* These are relative to the input text. */
|
||||||
|
int base_ind = ATTR_POS (tag,attrind,ctx);
|
||||||
|
int size = strlen (srcset);
|
||||||
|
|
||||||
|
/* These are relative to srcset. */
|
||||||
|
int offset, url_start, url_end;
|
||||||
|
|
||||||
|
/* Make sure to line up base_ind with srcset[0], not outside quotes. */
|
||||||
|
if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
|
||||||
|
++base_ind;
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
while (offset < size)
|
||||||
|
{
|
||||||
|
bool has_descriptor = true;
|
||||||
|
|
||||||
|
/* Skip over initial whitespace and commas. Note there is no \v
|
||||||
|
in HTML5 whitespace. */
|
||||||
|
url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
|
||||||
|
|
||||||
|
if (url_start == size)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* URL is any non-whitespace chars (including commas) - but with
|
||||||
|
trailing commas removed. */
|
||||||
|
url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
|
||||||
|
while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
|
||||||
|
{
|
||||||
|
has_descriptor = false;
|
||||||
|
--url_end;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (url_end > url_start)
|
||||||
|
{
|
||||||
|
char *url_text = strdupdelim (srcset + url_start,
|
||||||
|
srcset + url_end);
|
||||||
|
struct urlpos *up = append_url (url_text, base_ind + url_start,
|
||||||
|
url_end - url_start, ctx);
|
||||||
|
up->link_inline_p = 1;
|
||||||
|
up->link_noquote_html_p = 1;
|
||||||
|
xfree (url_text);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the URL wasn't terminated by a , there may also be a descriptor
|
||||||
|
which we just skip. */
|
||||||
|
if (has_descriptor)
|
||||||
|
{
|
||||||
|
/* This is comma-terminated, except there may be one level of
|
||||||
|
parentheses escaping that. */
|
||||||
|
bool in_paren = false;
|
||||||
|
for (offset = url_end; offset < size; ++offset)
|
||||||
|
{
|
||||||
|
char c = srcset[offset];
|
||||||
|
if (c == '(')
|
||||||
|
in_paren = true;
|
||||||
|
else if (c == ')' && in_paren)
|
||||||
|
in_paren = false;
|
||||||
|
else if (c == ',' && !in_paren)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
offset = url_end;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Dispatch the tag handler appropriate for the tag we're mapping
|
/* Dispatch the tag handler appropriate for the tag we're mapping
|
||||||
over. See known_tags[] for definition of tag handlers. */
|
over. See known_tags[] for definition of tag handlers. */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user