1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

Parse <img srcset> attributes, since they contain image URLs.

* src/convert.h: Add link_noquote_html_p to permit rewriting URLs deep
                 inside attributes without adding extraneous quoting
* src/convert.c (convert_links): Honor link_noquote_html_p
* src/html_url.c (tag_handle_img): New function. Add srcset parsing.
This commit is contained in:
Maks Orlovich 2016-03-01 09:43:56 -05:00 committed by Giuseppe Scrivano
parent 7099f48998
commit c28f51aadf
3 changed files with 90 additions and 5 deletions

View File

@ -308,7 +308,7 @@ convert_links (const char *file, struct urlpos *links)
char *quoted_newname = local_quote_string (newname,
link->link_css_p);
if (link->link_css_p)
if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, quoted_newname);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
@ -329,7 +329,7 @@ convert_links (const char *file, struct urlpos *links)
char *newname = convert_basename (p, link);
char *quoted_newname = local_quote_string (newname, link->link_css_p);
if (link->link_css_p)
if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, quoted_newname);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newname);
@ -352,7 +352,7 @@ convert_links (const char *file, struct urlpos *links)
char *newlink = link->url->url;
char *quoted_newlink = html_quote_string (newlink);
if (link->link_css_p)
if (link->link_css_p || link->link_noquote_html_p)
p = replace_plain (p, link->size, fp, newlink);
else if (!link->link_refresh_p)
p = replace_attr (p, link->size, fp, quoted_newlink);

View File

@ -69,6 +69,7 @@ struct urlpos {
unsigned int link_base_p :1; /* the url came from <base href=...> */
unsigned int link_inline_p :1; /* needed to render the page */
unsigned int link_css_p :1; /* the url came from CSS */
unsigned int link_noquote_html_p :1; /* from HTML, but doesn't need " */
unsigned int link_expect_html :1; /* expected to contain HTML */
unsigned int link_expect_css :1; /* expected to contain CSS */

View File

@ -56,6 +56,7 @@ typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);
DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_img);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
@ -105,7 +106,7 @@ static struct known_tag {
{ TAG_FORM, "form", tag_handle_form },
{ TAG_FRAME, "frame", tag_find_urls },
{ TAG_IFRAME, "iframe", tag_find_urls },
{ TAG_IMG, "img", tag_find_urls },
{ TAG_IMG, "img", tag_handle_img },
{ TAG_INPUT, "input", tag_find_urls },
{ TAG_LAYER, "layer", tag_find_urls },
{ TAG_LINK, "link", tag_handle_link },
@ -183,7 +184,8 @@ static const char *additional_attributes[] = {
"name", /* used by tag_handle_meta */
"content", /* used by tag_handle_meta */
"action", /* used by tag_handle_form */
"style" /* used by check_style_attr */
"style", /* used by check_style_attr */
"srcset", /* used by tag_handle_img */
};
static struct hash_table *interesting_tags;
@ -674,6 +676,88 @@ tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *
}
}
/* Handle the IMG tag.

   The traditional src/lowsrc/href attributes have no special syntax and
   are delegated to tag_find_urls.  The srcset attribute needs its own
   parsing: it is a comma-separated list of image candidates, each one a
   URL optionally followed by whitespace and a descriptor.

   Every URL found in srcset is registered with append_url, using its
   position and length inside the input text, and is flagged as inline
   (link_inline_p) and as not needing attribute quoting when rewritten
   (link_noquote_html_p), since it lives in the middle of an attribute
   value rather than spanning the whole value.

   NOTE(review): the positions handed to append_url assume that the
   srcset string returned by find_attr lines up byte-for-byte with the
   raw text starting at the attribute value -- confirm this holds when
   the value contains character entities or trimmed whitespace.  */
static void
tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx)
{
  int attrind;
  char *srcset;

  /* Use the generic approach for the attributes without special syntax. */
  tag_find_urls (tagid, tag, ctx);

  srcset = find_attr (tag, "srcset", &attrind);
  if (srcset)
    {
      /* These are relative to the input text. */
      int base_ind = ATTR_POS (tag, attrind, ctx);
      int size = strlen (srcset);

      /* These are relative to srcset. */
      int offset, url_start, url_end;

      /* Make sure to line up base_ind with srcset[0], not outside quotes. */
      if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
        ++base_ind;

      offset = 0;
      while (offset < size)
        {
          bool has_descriptor = true;

          /* Skip over initial whitespace and commas.  Note there is no \v
             in HTML5 whitespace. */
          url_start = offset + strspn (srcset + offset, " \f\n\r\t,");

          /* Nothing but separators left: no more candidates. */
          if (url_start == size)
            return;

          /* URL is any non-whitespace chars (including commas) - but with
             trailing commas removed.  A trailing comma terminates the
             candidate, so no descriptor can follow it. */
          url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
          while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
            {
              has_descriptor = false;
              --url_end;
            }

          if (url_end > url_start)
            {
              char *url_text = strdupdelim (srcset + url_start,
                                            srcset + url_end);
              struct urlpos *up = append_url (url_text, base_ind + url_start,
                                              url_end - url_start, ctx);

              /* append_url is expected to return NULL when it cannot
                 record the URL (e.g. it cannot be resolved or parsed);
                 do not dereference the result in that case. */
              if (up)
                {
                  up->link_inline_p = 1;
                  up->link_noquote_html_p = 1;
                }
              xfree (url_text);
            }

          /* If the URL wasn't terminated by a , there may also be a descriptor
             which we just skip. */
          if (has_descriptor)
            {
              /* This is comma-terminated, except there may be one level of
                 parentheses escaping that. */
              bool in_paren = false;

              for (offset = url_end; offset < size; ++offset)
                {
                  char c = srcset[offset];

                  if (c == '(')
                    in_paren = true;
                  else if (c == ')' && in_paren)
                    in_paren = false;
                  else if (c == ',' && !in_paren)
                    break;
                }
            }
          else
            offset = url_end;
        }
    }
}
/* Dispatch the tag handler appropriate for the tag we're mapping
over. See known_tags[] for definition of tag handlers. */