mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
Add support for -accept-regex and --reject-regex.
This commit is contained in:
parent
0aa3c5d33c
commit
f5a1097871
@ -1,3 +1,8 @@
|
|||||||
|
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
|
||||||
|
|
||||||
|
* bootstrap.conf (gnulib_modules): Include module `regex'.
|
||||||
|
* configure.ac: Check for PCRE library.
|
||||||
|
|
||||||
2012-03-25 Ray Satiro <raysatiro@yahoo.com>
|
2012-03-25 Ray Satiro <raysatiro@yahoo.com>
|
||||||
|
|
||||||
* configure.ac: Fix build under mingw when OpenSSL is used.
|
* configure.ac: Fix build under mingw when OpenSSL is used.
|
||||||
|
@ -58,6 +58,7 @@ pipe
|
|||||||
quote
|
quote
|
||||||
quotearg
|
quotearg
|
||||||
recv
|
recv
|
||||||
|
regex
|
||||||
select
|
select
|
||||||
send
|
send
|
||||||
setsockopt
|
setsockopt
|
||||||
|
12
configure.ac
12
configure.ac
@ -532,6 +532,18 @@ AC_CHECK_HEADER(uuid/uuid.h,
|
|||||||
])
|
])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
dnl
|
||||||
|
dnl Check for PCRE
|
||||||
|
dnl
|
||||||
|
|
||||||
|
AC_CHECK_HEADER(pcre.h,
|
||||||
|
AC_CHECK_LIB(pcre, pcre_compile,
|
||||||
|
[LIBS="${LIBS} -lpcre"
|
||||||
|
AC_DEFINE([HAVE_LIBPCRE], 1,
|
||||||
|
[Define if libpcre is available.])
|
||||||
|
])
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
dnl Needed by src/Makefile.am
|
dnl Needed by src/Makefile.am
|
||||||
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])
|
||||||
|
@ -1,3 +1,12 @@
|
|||||||
|
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
|
||||||
|
|
||||||
|
* init.c: Add --accept-regex, --reject-regex and --regex-type.
|
||||||
|
* main.c: Likewise.
|
||||||
|
* options.c: Likewise.
|
||||||
|
* recur.c: Likewise.
|
||||||
|
* utils.c: Add regex-related functions.
|
||||||
|
* utils.h: Add regex-related functions.
|
||||||
|
|
||||||
2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de>
|
2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de>
|
||||||
|
|
||||||
* convert.c (convert_links_in_hashtable): Mmake it static.
|
* convert.c (convert_links_in_hashtable): Mmake it static.
|
||||||
|
29
src/init.c
29
src/init.c
@ -46,6 +46,10 @@ as that of the covered work. */
|
|||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <regex.h>
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
# include <pcre.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_PWD_H
|
#ifdef HAVE_PWD_H
|
||||||
# include <pwd.h>
|
# include <pwd.h>
|
||||||
@ -94,6 +98,7 @@ CMD_DECLARE (cmd_spec_mirror);
|
|||||||
CMD_DECLARE (cmd_spec_prefer_family);
|
CMD_DECLARE (cmd_spec_prefer_family);
|
||||||
CMD_DECLARE (cmd_spec_progress);
|
CMD_DECLARE (cmd_spec_progress);
|
||||||
CMD_DECLARE (cmd_spec_recursive);
|
CMD_DECLARE (cmd_spec_recursive);
|
||||||
|
CMD_DECLARE (cmd_spec_regex_type);
|
||||||
CMD_DECLARE (cmd_spec_restrict_file_names);
|
CMD_DECLARE (cmd_spec_restrict_file_names);
|
||||||
#ifdef HAVE_SSL
|
#ifdef HAVE_SSL
|
||||||
CMD_DECLARE (cmd_spec_secure_protocol);
|
CMD_DECLARE (cmd_spec_secure_protocol);
|
||||||
@ -116,6 +121,7 @@ static const struct {
|
|||||||
} commands[] = {
|
} commands[] = {
|
||||||
/* KEEP THIS LIST ALPHABETICALLY SORTED */
|
/* KEEP THIS LIST ALPHABETICALLY SORTED */
|
||||||
{ "accept", &opt.accepts, cmd_vector },
|
{ "accept", &opt.accepts, cmd_vector },
|
||||||
|
{ "acceptregex", &opt.acceptregex_s, cmd_string },
|
||||||
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
|
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
|
||||||
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
|
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
|
||||||
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
|
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
|
||||||
@ -236,7 +242,9 @@ static const struct {
|
|||||||
{ "reclevel", &opt.reclevel, cmd_number_inf },
|
{ "reclevel", &opt.reclevel, cmd_number_inf },
|
||||||
{ "recursive", NULL, cmd_spec_recursive },
|
{ "recursive", NULL, cmd_spec_recursive },
|
||||||
{ "referer", &opt.referer, cmd_string },
|
{ "referer", &opt.referer, cmd_string },
|
||||||
|
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
|
||||||
{ "reject", &opt.rejects, cmd_vector },
|
{ "reject", &opt.rejects, cmd_vector },
|
||||||
|
{ "rejectregex", &opt.rejectregex_s, cmd_string },
|
||||||
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
{ "relativeonly", &opt.relative_only, cmd_boolean },
|
||||||
{ "remoteencoding", &opt.encoding_remote, cmd_string },
|
{ "remoteencoding", &opt.encoding_remote, cmd_string },
|
||||||
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
{ "removelisting", &opt.remove_listing, cmd_boolean },
|
||||||
@ -361,6 +369,8 @@ defaults (void)
|
|||||||
opt.restrict_files_nonascii = false;
|
opt.restrict_files_nonascii = false;
|
||||||
opt.restrict_files_case = restrict_no_case_restriction;
|
opt.restrict_files_case = restrict_no_case_restriction;
|
||||||
|
|
||||||
|
opt.regex_type = regex_type_posix;
|
||||||
|
|
||||||
opt.max_redirect = 20;
|
opt.max_redirect = 20;
|
||||||
|
|
||||||
opt.waitretry = 10;
|
opt.waitretry = 10;
|
||||||
@ -1368,6 +1378,25 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Validate --regex-type and set the choice. */
|
||||||
|
|
||||||
|
static bool
|
||||||
|
cmd_spec_regex_type (const char *com, const char *val, void *place_ignored)
|
||||||
|
{
|
||||||
|
static const struct decode_item choices[] = {
|
||||||
|
{ "posix", regex_type_posix },
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
{ "pcre", regex_type_pcre },
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
int regex_type = regex_type_posix;
|
||||||
|
int ok = decode_string (val, choices, countof (choices), ®ex_type);
|
||||||
|
if (!ok)
|
||||||
|
fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
|
||||||
|
opt.regex_type = regex_type;
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
|
cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
|
||||||
{
|
{
|
||||||
|
43
src/main.c
43
src/main.c
@ -158,6 +158,7 @@ struct cmdline_option {
|
|||||||
static struct cmdline_option option_data[] =
|
static struct cmdline_option option_data[] =
|
||||||
{
|
{
|
||||||
{ "accept", 'A', OPT_VALUE, "accept", -1 },
|
{ "accept", 'A', OPT_VALUE, "accept", -1 },
|
||||||
|
{ "accept-regex", 0, OPT_VALUE, "acceptregex", -1 },
|
||||||
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
|
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
|
||||||
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
|
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
|
||||||
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
|
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
|
||||||
@ -262,7 +263,9 @@ static struct cmdline_option option_data[] =
|
|||||||
{ "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
|
{ "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
|
||||||
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
|
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
|
||||||
{ "referer", 0, OPT_VALUE, "referer", -1 },
|
{ "referer", 0, OPT_VALUE, "referer", -1 },
|
||||||
|
{ "regex-type", 0, OPT_VALUE, "regextype", -1 },
|
||||||
{ "reject", 'R', OPT_VALUE, "reject", -1 },
|
{ "reject", 'R', OPT_VALUE, "reject", -1 },
|
||||||
|
{ "reject-regex", 0, OPT_VALUE, "rejectregex", -1 },
|
||||||
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
|
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
|
||||||
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
|
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
|
||||||
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
|
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
|
||||||
@ -722,6 +725,17 @@ Recursive accept/reject:\n"),
|
|||||||
-A, --accept=LIST comma-separated list of accepted extensions.\n"),
|
-A, --accept=LIST comma-separated list of accepted extensions.\n"),
|
||||||
N_("\
|
N_("\
|
||||||
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
|
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
|
||||||
|
N_("\
|
||||||
|
--accept-regex=REGEX regex matching accepted URLs.\n"),
|
||||||
|
N_("\
|
||||||
|
--reject-regex=REGEX regex matching rejected URLs.\n"),
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
N_("\
|
||||||
|
--regex-type=TYPE regex type (posix|pcre).\n"),
|
||||||
|
#else
|
||||||
|
N_("\
|
||||||
|
--regex-type=TYPE regex type (posix).\n"),
|
||||||
|
#endif
|
||||||
N_("\
|
N_("\
|
||||||
-D, --domains=LIST comma-separated list of accepted domains.\n"),
|
-D, --domains=LIST comma-separated list of accepted domains.\n"),
|
||||||
N_("\
|
N_("\
|
||||||
@ -1323,6 +1337,35 @@ for details.\n\n"));
|
|||||||
exit (1);
|
exit (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Compile the regular expressions. */
|
||||||
|
switch (opt.regex_type)
|
||||||
|
{
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
case regex_type_pcre:
|
||||||
|
opt.regex_compile_fun = compile_pcre_regex;
|
||||||
|
opt.regex_match_fun = match_pcre_regex;
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
case regex_type_posix:
|
||||||
|
default:
|
||||||
|
opt.regex_compile_fun = compile_posix_regex;
|
||||||
|
opt.regex_match_fun = match_posix_regex;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (opt.acceptregex_s)
|
||||||
|
{
|
||||||
|
opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
|
||||||
|
if (!opt.acceptregex)
|
||||||
|
exit (1);
|
||||||
|
}
|
||||||
|
if (opt.rejectregex_s)
|
||||||
|
{
|
||||||
|
opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
|
||||||
|
if (!opt.rejectregex)
|
||||||
|
exit (1);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_IRI
|
#ifdef ENABLE_IRI
|
||||||
if (opt.enable_iri)
|
if (opt.enable_iri)
|
||||||
{
|
{
|
||||||
|
@ -74,6 +74,19 @@ struct options
|
|||||||
bool ignore_case; /* Whether to ignore case when
|
bool ignore_case; /* Whether to ignore case when
|
||||||
matching dirs and files */
|
matching dirs and files */
|
||||||
|
|
||||||
|
char *acceptregex_s; /* Patterns to accept (a regex string). */
|
||||||
|
char *rejectregex_s; /* Patterns to reject (a regex string). */
|
||||||
|
void *acceptregex; /* Patterns to accept (a regex struct). */
|
||||||
|
void *rejectregex; /* Patterns to reject (a regex struct). */
|
||||||
|
enum {
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
regex_type_pcre,
|
||||||
|
#endif
|
||||||
|
regex_type_posix
|
||||||
|
} regex_type; /* The regex library. */
|
||||||
|
void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */
|
||||||
|
bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */
|
||||||
|
|
||||||
char **domains; /* See host.c */
|
char **domains; /* See host.c */
|
||||||
char **exclude_domains;
|
char **exclude_domains;
|
||||||
bool dns_cache; /* whether we cache DNS lookups. */
|
bool dns_cache; /* whether we cache DNS lookups. */
|
||||||
|
@ -586,6 +586,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!accept_url (url))
|
||||||
|
{
|
||||||
|
DEBUGP (("%s is excluded/not-included through regex.\n", url));
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
/* 6. Check for acceptance/rejection rules. We ignore these rules
|
/* 6. Check for acceptance/rejection rules. We ignore these rules
|
||||||
for directories (no file name to match) and for non-leaf HTMLs,
|
for directories (no file name to match) and for non-leaf HTMLs,
|
||||||
|
101
src/utils.c
101
src/utils.c
@ -73,6 +73,11 @@ as that of the covered work. */
|
|||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <setjmp.h>
|
#include <setjmp.h>
|
||||||
|
|
||||||
|
#include <regex.h>
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
# include <pcre.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_SIGSETJMP
|
#ifndef HAVE_SIGSETJMP
|
||||||
/* If sigsetjmp is a macro, configure won't pick it up. */
|
/* If sigsetjmp is a macro, configure won't pick it up. */
|
||||||
# ifdef sigsetjmp
|
# ifdef sigsetjmp
|
||||||
@ -917,6 +922,19 @@ acceptable (const char *s)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Determine whether an URL is acceptable to be followed, according to
|
||||||
|
regex patterns to accept/reject. */
|
||||||
|
bool
|
||||||
|
accept_url (const char *s)
|
||||||
|
{
|
||||||
|
if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s))
|
||||||
|
return false;
|
||||||
|
if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
|
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
|
||||||
will return true if and only if D2 begins with `/something/' or is exactly
|
will return true if and only if D2 begins with `/something/' or is exactly
|
||||||
'/something'. */
|
'/something'. */
|
||||||
@ -2309,6 +2327,89 @@ base64_decode (const char *base64, void *dest)
|
|||||||
return q - (char *) dest;
|
return q - (char *) dest;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
/* Compiles the PCRE regex. */
|
||||||
|
void *
|
||||||
|
compile_pcre_regex (const char *str)
|
||||||
|
{
|
||||||
|
const char *errbuf;
|
||||||
|
int erroffset;
|
||||||
|
pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0);
|
||||||
|
if (! regex)
|
||||||
|
{
|
||||||
|
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
|
||||||
|
quote (str), errbuf);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return regex;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Compiles the POSIX regex. */
|
||||||
|
void *
|
||||||
|
compile_posix_regex (const char *str)
|
||||||
|
{
|
||||||
|
regex_t *regex = xmalloc (sizeof (regex_t));
|
||||||
|
int errcode = regcomp ((regex_t *) regex, str, REG_EXTENDED | REG_NOSUB);
|
||||||
|
if (errcode != 0)
|
||||||
|
{
|
||||||
|
int errbuf_size = regerror (errcode, (regex_t *) regex, NULL, 0);
|
||||||
|
char *errbuf = xmalloc (errbuf_size);
|
||||||
|
errbuf_size = regerror (errcode, (regex_t *) regex, errbuf, errbuf_size);
|
||||||
|
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
|
||||||
|
quote (str), errbuf);
|
||||||
|
xfree (errbuf);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return regex;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
#define OVECCOUNT 30
|
||||||
|
/* Matches a PCRE regex. */
|
||||||
|
bool
|
||||||
|
match_pcre_regex (const void *regex, const char *str)
|
||||||
|
{
|
||||||
|
int l = strlen (str);
|
||||||
|
int ovector[OVECCOUNT];
|
||||||
|
|
||||||
|
int rc = pcre_exec ((pcre *) regex, 0, str, l, 0, 0, ovector, OVECCOUNT);
|
||||||
|
if (rc == PCRE_ERROR_NOMATCH)
|
||||||
|
return false;
|
||||||
|
else if (rc < 0)
|
||||||
|
{
|
||||||
|
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
|
||||||
|
quote (str), rc);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#undef OVECCOUNT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Matches a POSIX regex. */
|
||||||
|
bool
|
||||||
|
match_posix_regex (const void *regex, const char *str)
|
||||||
|
{
|
||||||
|
int rc = regexec ((regex_t *) regex, str, 0, NULL, 0);
|
||||||
|
if (rc == REG_NOMATCH)
|
||||||
|
return false;
|
||||||
|
else if (rc == 0)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
|
||||||
|
char *errbuf = xmalloc (errbuf_size);
|
||||||
|
errbuf_size = regerror (rc, opt.acceptregex, errbuf, errbuf_size);
|
||||||
|
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
|
||||||
|
quote (str), rc);
|
||||||
|
xfree (errbuf);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#undef IS_ASCII
|
#undef IS_ASCII
|
||||||
#undef NEXT_CHAR
|
#undef NEXT_CHAR
|
||||||
|
|
||||||
|
@ -90,6 +90,7 @@ char *file_merge (const char *, const char *);
|
|||||||
|
|
||||||
int fnmatch_nocase (const char *, const char *, int);
|
int fnmatch_nocase (const char *, const char *, int);
|
||||||
bool acceptable (const char *);
|
bool acceptable (const char *);
|
||||||
|
bool accept_url (const char *);
|
||||||
bool accdir (const char *s);
|
bool accdir (const char *s);
|
||||||
char *suffix (const char *s);
|
char *suffix (const char *s);
|
||||||
bool match_tail (const char *, const char *, bool);
|
bool match_tail (const char *, const char *, bool);
|
||||||
@ -142,6 +143,14 @@ void xsleep (double);
|
|||||||
int base64_encode (const void *, int, char *);
|
int base64_encode (const void *, int, char *);
|
||||||
int base64_decode (const char *, void *);
|
int base64_decode (const char *, void *);
|
||||||
|
|
||||||
|
#ifdef HAVE_LIBPCRE
|
||||||
|
void *compile_pcre_regex (const char *);
|
||||||
|
bool match_pcre_regex (const void *, const char *);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void *compile_posix_regex (const char *);
|
||||||
|
bool match_posix_regex (const void *, const char *);
|
||||||
|
|
||||||
void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
|
void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
|
||||||
|
|
||||||
const char *print_decimal (double);
|
const char *print_decimal (double);
|
||||||
|
Loading…
Reference in New Issue
Block a user