Add support for -accept-regex and --reject-regex.

This commit is contained in:
Gijs van Tulder 2012-05-09 21:18:23 +02:00 committed by Giuseppe Scrivano
parent 0aa3c5d33c
commit f5a1097871
10 changed files with 227 additions and 0 deletions

View File

@ -1,3 +1,8 @@
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
* bootstrap.conf (gnulib_modules): Include module `regex'.
* configure.ac: Check for PCRE library.
2012-03-25 Ray Satiro <raysatiro@yahoo.com>
* configure.ac: Fix build under mingw when OpenSSL is used.

View File

@ -58,6 +58,7 @@ pipe
quote
quotearg
recv
regex
select
send
setsockopt

View File

@ -532,6 +532,18 @@ AC_CHECK_HEADER(uuid/uuid.h,
])
)
dnl
dnl Check for PCRE
dnl
AC_CHECK_HEADER(pcre.h,
AC_CHECK_LIB(pcre, pcre_compile,
[LIBS="${LIBS} -lpcre"
AC_DEFINE([HAVE_LIBPCRE], 1,
[Define if libpcre is available.])
])
)
dnl Needed by src/Makefile.am
AM_CONDITIONAL([IRI_IS_ENABLED], [test "X$iri" != "Xno"])

View File

@ -1,3 +1,12 @@
2012-04-11 Gijs van Tulder <gvtulder@gmail.com>
* init.c: Add --accept-regex, --reject-regex and --regex-type.
* main.c: Likewise.
* options.c: Likewise.
* recur.c: Likewise.
* utils.c: Add regex-related functions.
* utils.h: Add regex-related functions.
2012-03-30 Tim Ruehsen <tim.ruehsen@gmx.de>
* convert.c (convert_links_in_hashtable): Mmake it static.

View File

@ -46,6 +46,10 @@ as that of the covered work. */
# endif
#endif
#include <regex.h>
#ifdef HAVE_LIBPCRE
# include <pcre.h>
#endif
#ifdef HAVE_PWD_H
# include <pwd.h>
@ -94,6 +98,7 @@ CMD_DECLARE (cmd_spec_mirror);
CMD_DECLARE (cmd_spec_prefer_family);
CMD_DECLARE (cmd_spec_progress);
CMD_DECLARE (cmd_spec_recursive);
CMD_DECLARE (cmd_spec_regex_type);
CMD_DECLARE (cmd_spec_restrict_file_names);
#ifdef HAVE_SSL
CMD_DECLARE (cmd_spec_secure_protocol);
@ -116,6 +121,7 @@ static const struct {
} commands[] = {
/* KEEP THIS LIST ALPHABETICALLY SORTED */
{ "accept", &opt.accepts, cmd_vector },
{ "acceptregex", &opt.acceptregex_s, cmd_string },
{ "addhostdir", &opt.add_hostdir, cmd_boolean },
{ "adjustextension", &opt.adjust_extension, cmd_boolean },
{ "alwaysrest", &opt.always_rest, cmd_boolean }, /* deprecated */
@ -236,7 +242,9 @@ static const struct {
{ "reclevel", &opt.reclevel, cmd_number_inf },
{ "recursive", NULL, cmd_spec_recursive },
{ "referer", &opt.referer, cmd_string },
{ "regextype", &opt.regex_type, cmd_spec_regex_type },
{ "reject", &opt.rejects, cmd_vector },
{ "rejectregex", &opt.rejectregex_s, cmd_string },
{ "relativeonly", &opt.relative_only, cmd_boolean },
{ "remoteencoding", &opt.encoding_remote, cmd_string },
{ "removelisting", &opt.remove_listing, cmd_boolean },
@ -361,6 +369,8 @@ defaults (void)
opt.restrict_files_nonascii = false;
opt.restrict_files_case = restrict_no_case_restriction;
opt.regex_type = regex_type_posix;
opt.max_redirect = 20;
opt.waitretry = 10;
@ -1368,6 +1378,25 @@ cmd_spec_recursive (const char *com, const char *val, void *place_ignored)
return true;
}
/* Validate --regex-type and set the choice. */
static bool
cmd_spec_regex_type (const char *com, const char *val, void *place_ignored)
{
static const struct decode_item choices[] = {
{ "posix", regex_type_posix },
#ifdef HAVE_LIBPCRE
{ "pcre", regex_type_pcre },
#endif
};
int regex_type = regex_type_posix;
int ok = decode_string (val, choices, countof (choices), &regex_type);
if (!ok)
fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com, quote (val));
opt.regex_type = regex_type;
return ok;
}
static bool
cmd_spec_restrict_file_names (const char *com, const char *val, void *place_ignored)
{

View File

@ -158,6 +158,7 @@ struct cmdline_option {
static struct cmdline_option option_data[] =
{
{ "accept", 'A', OPT_VALUE, "accept", -1 },
{ "accept-regex", 0, OPT_VALUE, "acceptregex", -1 },
{ "adjust-extension", 'E', OPT_BOOLEAN, "adjustextension", -1 },
{ "append-output", 'a', OPT__APPEND_OUTPUT, NULL, required_argument },
{ "ask-password", 0, OPT_BOOLEAN, "askpassword", -1 },
@ -262,7 +263,9 @@ static struct cmdline_option option_data[] =
{ "read-timeout", 0, OPT_VALUE, "readtimeout", -1 },
{ "recursive", 'r', OPT_BOOLEAN, "recursive", -1 },
{ "referer", 0, OPT_VALUE, "referer", -1 },
{ "regex-type", 0, OPT_VALUE, "regextype", -1 },
{ "reject", 'R', OPT_VALUE, "reject", -1 },
{ "reject-regex", 0, OPT_VALUE, "rejectregex", -1 },
{ "relative", 'L', OPT_BOOLEAN, "relativeonly", -1 },
{ "remote-encoding", 0, OPT_VALUE, "remoteencoding", -1 },
{ "remove-listing", 0, OPT_BOOLEAN, "removelisting", -1 },
@ -722,6 +725,17 @@ Recursive accept/reject:\n"),
-A, --accept=LIST comma-separated list of accepted extensions.\n"),
N_("\
-R, --reject=LIST comma-separated list of rejected extensions.\n"),
N_("\
--accept-regex=REGEX regex matching accepted URLs.\n"),
N_("\
--reject-regex=REGEX regex matching rejected URLs.\n"),
#ifdef HAVE_LIBPCRE
N_("\
--regex-type=TYPE regex type (posix|pcre).\n"),
#else
N_("\
--regex-type=TYPE regex type (posix).\n"),
#endif
N_("\
-D, --domains=LIST comma-separated list of accepted domains.\n"),
N_("\
@ -1323,6 +1337,35 @@ for details.\n\n"));
exit (1);
}
/* Compile the regular expressions. */
switch (opt.regex_type)
{
#ifdef HAVE_LIBPCRE
case regex_type_pcre:
opt.regex_compile_fun = compile_pcre_regex;
opt.regex_match_fun = match_pcre_regex;
break;
#endif
case regex_type_posix:
default:
opt.regex_compile_fun = compile_posix_regex;
opt.regex_match_fun = match_posix_regex;
break;
}
if (opt.acceptregex_s)
{
opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
if (!opt.acceptregex)
exit (1);
}
if (opt.rejectregex_s)
{
opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
if (!opt.rejectregex)
exit (1);
}
#ifdef ENABLE_IRI
if (opt.enable_iri)
{

View File

@ -74,6 +74,19 @@ struct options
bool ignore_case; /* Whether to ignore case when
matching dirs and files */
char *acceptregex_s; /* Patterns to accept (a regex string). */
char *rejectregex_s; /* Patterns to reject (a regex string). */
void *acceptregex; /* Patterns to accept (a regex struct). */
void *rejectregex; /* Patterns to reject (a regex struct). */
enum {
#ifdef HAVE_LIBPCRE
regex_type_pcre,
#endif
regex_type_posix
} regex_type; /* The regex library. */
void *(*regex_compile_fun)(const char *); /* Function to compile a regex. */
bool (*regex_match_fun)(const void *, const char *); /* Function to match a string to a regex. */
char **domains; /* See host.c */
char **exclude_domains;
bool dns_cache; /* whether we cache DNS lookups. */

View File

@ -586,6 +586,11 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
goto out;
}
}
if (!accept_url (url))
{
DEBUGP (("%s is excluded/not-included through regex.\n", url));
goto out;
}
/* 6. Check for acceptance/rejection rules. We ignore these rules
for directories (no file name to match) and for non-leaf HTMLs,

View File

@ -73,6 +73,11 @@ as that of the covered work. */
#include <signal.h>
#include <setjmp.h>
#include <regex.h>
#ifdef HAVE_LIBPCRE
# include <pcre.h>
#endif
#ifndef HAVE_SIGSETJMP
/* If sigsetjmp is a macro, configure won't pick it up. */
# ifdef sigsetjmp
@ -917,6 +922,19 @@ acceptable (const char *s)
return true;
}
/* Determine whether an URL is acceptable to be followed, according to
regex patterns to accept/reject. */
bool
accept_url (const char *s)
{
if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s))
return false;
if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s))
return false;
return true;
}
/* Check if D2 is a subdirectory of D1. E.g. if D1 is `/something', subdir_p()
will return true if and only if D2 begins with `/something/' or is exactly
'/something'. */
@ -2309,6 +2327,89 @@ base64_decode (const char *base64, void *dest)
return q - (char *) dest;
}
#ifdef HAVE_LIBPCRE
/* Compiles the PCRE regex. */
void *
compile_pcre_regex (const char *str)
{
const char *errbuf;
int erroffset;
pcre *regex = pcre_compile (str, 0, &errbuf, &erroffset, 0);
if (! regex)
{
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
quote (str), errbuf);
return false;
}
return regex;
}
#endif
/* Compiles the POSIX regex. */
void *
compile_posix_regex (const char *str)
{
regex_t *regex = xmalloc (sizeof (regex_t));
int errcode = regcomp ((regex_t *) regex, str, REG_EXTENDED | REG_NOSUB);
if (errcode != 0)
{
int errbuf_size = regerror (errcode, (regex_t *) regex, NULL, 0);
char *errbuf = xmalloc (errbuf_size);
errbuf_size = regerror (errcode, (regex_t *) regex, errbuf, errbuf_size);
fprintf (stderr, _("Invalid regular expression %s, %s\n"),
quote (str), errbuf);
xfree (errbuf);
return NULL;
}
return regex;
}
#ifdef HAVE_LIBPCRE
#define OVECCOUNT 30
/* Matches a PCRE regex. */
bool
match_pcre_regex (const void *regex, const char *str)
{
int l = strlen (str);
int ovector[OVECCOUNT];
int rc = pcre_exec ((pcre *) regex, 0, str, l, 0, 0, ovector, OVECCOUNT);
if (rc == PCRE_ERROR_NOMATCH)
return false;
else if (rc < 0)
{
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
quote (str), rc);
return false;
}
else
return true;
}
#undef OVECCOUNT
#endif
/* Matches a POSIX regex. */
bool
match_posix_regex (const void *regex, const char *str)
{
int rc = regexec ((regex_t *) regex, str, 0, NULL, 0);
if (rc == REG_NOMATCH)
return false;
else if (rc == 0)
return true;
else
{
int errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
char *errbuf = xmalloc (errbuf_size);
errbuf_size = regerror (rc, opt.acceptregex, errbuf, errbuf_size);
logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
quote (str), rc);
xfree (errbuf);
return false;
}
}
#undef IS_ASCII
#undef NEXT_CHAR

View File

@ -90,6 +90,7 @@ char *file_merge (const char *, const char *);
int fnmatch_nocase (const char *, const char *, int);
bool acceptable (const char *);
bool accept_url (const char *);
bool accdir (const char *s);
char *suffix (const char *s);
bool match_tail (const char *, const char *, bool);
@ -142,6 +143,14 @@ void xsleep (double);
int base64_encode (const void *, int, char *);
int base64_decode (const char *, void *);
#ifdef HAVE_LIBPCRE
void *compile_pcre_regex (const char *);
bool match_pcre_regex (const void *, const char *);
#endif
void *compile_posix_regex (const char *);
bool match_posix_regex (const void *, const char *);
void stable_sort (void *, size_t, size_t, int (*) (const void *, const void *));
const char *print_decimal (double);