mirror of
https://github.com/moparisthebest/wget
synced 2024-07-03 16:38:41 -04:00
[svn] Fixes for recursive spider mode.
This commit is contained in:
parent
79f66dfd15
commit
60c88ee992
@ -1,3 +1,22 @@
|
||||
2006-06-28 Mauro Tortonesi <mauro@ferrara.linux.it>
|
||||
|
||||
* res.c: Implemented is_robots_txt_url function for detection of
|
||||
robots.txt URLs and related test routine.
|
||||
|
||||
* res.h: Ditto.
|
||||
|
||||
* url.c: Implemented are_urls_equal function for URL comparison and
|
||||
related testing routine.
|
||||
|
||||
* url.h: Ditto.
|
||||
|
||||
* convert.c: Fixes for recursive spider mode: don't consider
|
||||
non-existing robots.txt as a broken link, and use are_urls_equal
|
||||
instead of strcasecmp for referrer URLs comparison.
|
||||
|
||||
* test.c: Call test routines for are_urls_equal and
|
||||
is_robots_txt_url.
|
||||
|
||||
2006-06-26 Hrvoje Niksic <hniksic@xemacs.org>
|
||||
|
||||
* wget.h (wgint): Typedef to any 64-bit (or larger) type we can
|
||||
|
@ -45,6 +45,7 @@ so, delete this exception statement from your version. */
|
||||
#include "utils.h"
|
||||
#include "hash.h"
|
||||
#include "ptimer.h"
|
||||
#include "res.h"
|
||||
|
||||
static struct hash_table *dl_file_url_map;
|
||||
struct hash_table *dl_url_file_map;
|
||||
@ -972,8 +973,8 @@ in_list (const struct broken_urls_list *list, const char *url)
|
||||
|
||||
for (ptr = list; ptr; ptr = ptr->next)
|
||||
{
|
||||
/* TODO: strcasecmp may not be appropriate to compare URLs */
|
||||
if (strcasecmp (url, ptr->url) == 0) return true;
|
||||
/* str[case]cmp is inadequate for URL comparison */
|
||||
if (are_urls_equal (url, ptr->url) == 0) return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
@ -984,6 +985,10 @@ nonexisting_url (const char *url, const char *referrer)
|
||||
{
|
||||
struct broken_urls_list *list;
|
||||
|
||||
/* Ignore robots.txt URLs */
|
||||
if (is_robots_txt_url (url))
|
||||
return;
|
||||
|
||||
if (!nonexisting_urls_hash)
|
||||
nonexisting_urls_hash = make_string_hash_table (0);
|
||||
|
||||
@ -1140,3 +1145,8 @@ html_quote_string (const char *s)
|
||||
*p = '\0';
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* vim: et ts=2 sw=2
|
||||
*/
|
||||
|
||||
|
46
src/res.c
46
src/res.c
@ -84,6 +84,10 @@ so, delete this exception statement from your version. */
|
||||
#include "retr.h"
|
||||
#include "res.h"
|
||||
|
||||
#ifdef TESTING
|
||||
#include "test.h"
|
||||
#endif
|
||||
|
||||
struct path_info {
|
||||
char *path;
|
||||
bool allowedp;
|
||||
@ -552,6 +556,17 @@ res_retrieve_file (const char *url, char **file)
|
||||
return err == RETROK;
|
||||
}
|
||||
|
||||
bool
|
||||
is_robots_txt_url (const char *url)
|
||||
{
|
||||
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
|
||||
bool ret = are_urls_equal (url, robots_url);
|
||||
|
||||
xfree (robots_url);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
res_cleanup (void)
|
||||
{
|
||||
@ -569,3 +584,34 @@ res_cleanup (void)
|
||||
registered_specs = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TESTING
|
||||
|
||||
/* Table-driven unit test for is_robots_txt_url.

   Each entry pairs a URL with the result is_robots_txt_url is expected
   to produce for it.  Returns NULL when every entry passes; a failing
   entry is reported through mu_assert (presumably by returning the
   message string -- see test.h).  */
const char *
test_is_robots_txt_url()
{
  struct robots_case {
    const char *url;
    bool expected_result;
  };
  struct robots_case cases[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };
  struct robots_case *cur = cases;
  struct robots_case *end = cases + sizeof (cases) / sizeof (cases[0]);

  while (cur < end)
    {
      bool actual = is_robots_txt_url (cur->url);

      mu_assert ("test_is_robots_txt_url: wrong result",
                 actual == cur->expected_result);
      ++cur;
    }

  return NULL;
}
|
||||
|
||||
#endif /* TESTING */
|
||||
|
||||
/*
|
||||
* vim: et ts=2 sw=2
|
||||
*/
|
||||
|
||||
|
@ -42,6 +42,8 @@ struct robot_specs *res_get_specs (const char *, int);
|
||||
|
||||
bool res_retrieve_file (const char *, char **);
|
||||
|
||||
bool is_robots_txt_url (const char *);
|
||||
|
||||
void res_cleanup (void);
|
||||
|
||||
#endif /* RES_H */
|
||||
|
@ -40,6 +40,8 @@ const char *test_subdir_p();
|
||||
const char *test_dir_matches_p();
|
||||
const char *test_cmd_spec_restrict_file_names();
|
||||
const char *test_append_uri_pathel();
|
||||
const char *test_are_urls_equal();
|
||||
const char *test_is_robots_txt_url();
|
||||
|
||||
int tests_run;
|
||||
|
||||
@ -51,6 +53,8 @@ all_tests()
|
||||
mu_run_test (test_dir_matches_p);
|
||||
mu_run_test (test_cmd_spec_restrict_file_names);
|
||||
mu_run_test (test_append_uri_pathel);
|
||||
mu_run_test (test_are_urls_equal);
|
||||
mu_run_test (test_is_robots_txt_url);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
86
src/url.c
86
src/url.c
@ -1926,6 +1926,64 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Return the value (0-15) of the hexadecimal digit CH, or -1 if CH is
   not a hexadecimal digit.  */
static int
escaped_hex_digit_value (char ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';
  if (ch >= 'a' && ch <= 'f')
    return ch - 'a' + 10;
  if (ch >= 'A' && ch <= 'F')
    return ch - 'A' + 10;
  return -1;
}

/* Fetch the next logical character from the (possibly %-escaped)
   string STR and store it in *C.

   STR must be non-NULL and non-empty; C must be non-NULL.

   A well-formed escape "%XY", where X and Y are both hexadecimal
   digits, yields the byte with that value and consumes three bytes.
   Anything else -- an ordinary character, or a '%' that is not
   followed by two hexadecimal digits (including "%%", a trailing "%"
   and a trailing "%X") -- yields STR[0] itself and consumes one byte.

   Returns the number of bytes of STR that were consumed (1 or 3).
   The result is never 0, so callers that treat 0 as "invalid string"
   keep working; a malformed escape is deliberately read as a literal
   '%' so that two identical strings always decode identically.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] == '%')
    {
      int hi = escaped_hex_digit_value (str[1]);
      /* Only look at str[2] once str[1] is known to be a digit, so we
         never read past the terminating NUL.  */
      int lo = hi < 0 ? -1 : escaped_hex_digit_value (str[2]);

      if (lo >= 0)
        {
          *c = (char) ((hi << 4) | lo);
          return 3;
        }
      /* Not a valid escape: fall through and return the '%' as is.  */
    }

  *c = str[0];
  return 1;
}
|
||||
|
||||
/* Return true if the URL strings U1 and U2 are equal, false otherwise.

   The strings are walked in parallel one logical character at a time
   using getchar_from_escaped_string, so a %-escape in one string can
   match the corresponding plain character in the other.  The decoded
   characters are compared after passing each through TOLOWER.

   U1 and U2 must be non-NULL.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *p = u1;
  const char *q = u2;
  int pp, qq;
  char ch1, ch2;

  assert (u1 && u2);

  /* Both *p and *q must be checked before decoding:
     getchar_from_escaped_string requires a non-empty string, so
     calling it at the end of the shorter URL would be invalid.  */
  while (*p && *q
         && (pp = getchar_from_escaped_string (p, &ch1))
         && (qq = getchar_from_escaped_string (q, &ch2))
         && (TOLOWER (ch1) == TOLOWER (ch2)))
    {
      p += pp;
      q += qq;
    }

  /* Equal only if both strings were consumed completely.  */
  return *p == 0 && *q == 0;
}
|
||||
|
||||
#if 0
|
||||
/* Debugging and testing support for path_simplify. */
|
||||
|
||||
@ -2036,5 +2094,33 @@ test_append_uri_pathel()
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Table-driven unit test for are_urls_equal.

   Each entry holds two URLs and the result are_urls_equal is expected
   to return for them; the last entry checks that "~" matches its
   escaped form "%7e".  Returns NULL when every entry passes; a failing
   entry is reported through mu_assert (presumably by returning the
   message string -- see test.h).  */
const char*
test_are_urls_equal()
{
  struct url_pair_case {
    const char *url1;
    const char *url2;
    bool expected_result;
  };
  struct url_pair_case cases[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true },
  };
  struct url_pair_case *cur = cases;
  struct url_pair_case *end = cases + sizeof (cases) / sizeof (cases[0]);

  while (cur < end)
    {
      bool actual = are_urls_equal (cur->url1, cur->url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 actual == cur->expected_result);
      ++cur;
    }

  return NULL;
}
|
||||
|
||||
#endif /* TESTING */
|
||||
|
||||
/*
|
||||
* vim: et ts=2 sw=2
|
||||
*/
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user