1
0
mirror of https://github.com/moparisthebest/wget synced 2024-07-03 16:38:41 -04:00

[svn] Fixes for recursive spider mode.

This commit is contained in:
mtortonesi 2006-06-28 04:09:30 -07:00
parent 79f66dfd15
commit 60c88ee992
7 changed files with 845 additions and 676 deletions

View File

@ -1,3 +1,22 @@
2006-06-28 Mauro Tortonesi <mauro@ferrara.linux.it>
* res.c: Implemented is_robots_txt_url function for detection of
robots.txt URLs and related test routine.
* res.h: Ditto.
* url.c: Implemented are_urls_equal function for URL comparison and
related testing routine.
* url.h: Ditto.
* convert.c: Fixes for recursive spider mode: don't consider a
non-existing robots.txt as a broken link, and use are_urls_equal
instead of strcasecmp for referrer URL comparison.
* test.c: Call test routines for are_urls_equal and
is_robots_txt_url.
2006-06-26 Hrvoje Niksic <hniksic@xemacs.org>
* wget.h (wgint): Typedef to any 64-bit (or larger) type we can

View File

@ -45,6 +45,7 @@ so, delete this exception statement from your version. */
#include "utils.h"
#include "hash.h"
#include "ptimer.h"
#include "res.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
@ -972,8 +973,8 @@ in_list (const struct broken_urls_list *list, const char *url)
for (ptr = list; ptr; ptr = ptr->next)
{
/* TODO: strcasecmp may not be appropriate to compare URLs */
if (strcasecmp (url, ptr->url) == 0) return true;
/* str[case]cmp is inadequate for URL comparison */
if (are_urls_equal (url, ptr->url) == 0) return true;
}
return false;
@ -984,6 +985,10 @@ nonexisting_url (const char *url, const char *referrer)
{
struct broken_urls_list *list;
/* Ignore robots.txt URLs */
if (is_robots_txt_url (url))
return;
if (!nonexisting_urls_hash)
nonexisting_urls_hash = make_string_hash_table (0);
@ -1140,3 +1145,8 @@ html_quote_string (const char *s)
*p = '\0';
return res;
}
/*
* vim: et ts=2 sw=2
*/

View File

@ -84,6 +84,10 @@ so, delete this exception statement from your version. */
#include "retr.h"
#include "res.h"
#ifdef TESTING
#include "test.h"
#endif
struct path_info {
char *path;
bool allowedp;
@ -552,6 +556,17 @@ res_retrieve_file (const char *url, char **file)
return err == RETROK;
}
/* Report whether URL is a robots.txt URL.

   RES_SPECS_LOCATION is merged onto URL with uri_merge, and the merged
   string is compared with URL itself through are_urls_equal.  The
   merged string is released with xfree before returning.  */
bool
is_robots_txt_url (const char *url)
{
  bool matches;
  char *merged;

  merged = uri_merge (url, RES_SPECS_LOCATION);
  matches = are_urls_equal (url, merged);
  xfree (merged);

  return matches;
}
void
res_cleanup (void)
{
@ -569,3 +584,34 @@ res_cleanup (void)
registered_specs = NULL;
}
}
#ifdef TESTING
/* Unit test for is_robots_txt_url.  Each table entry pairs a URL with
   the answer is_robots_txt_url is expected to give for it.  Returns
   NULL when every entry matches; otherwise mu_assert is expected to
   return the failure message.  */
const char *
test_is_robots_txt_url()
{
  struct {
    char *url;
    bool expected_result;
  } cases[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };
  const int ncases = sizeof (cases) / sizeof (cases[0]);
  int idx = 0;

  while (idx < ncases)
    {
      bool actual = is_robots_txt_url (cases[idx].url);

      mu_assert ("test_is_robots_txt_url: wrong result",
                 actual == cases[idx].expected_result);
      ++idx;
    }

  return NULL;
}
#endif /* TESTING */
/*
* vim: et ts=2 sw=2
*/

View File

@ -42,6 +42,8 @@ struct robot_specs *res_get_specs (const char *, int);
bool res_retrieve_file (const char *, char **);
bool is_robots_txt_url (const char *);
void res_cleanup (void);
#endif /* RES_H */

View File

@ -40,6 +40,8 @@ const char *test_subdir_p();
const char *test_dir_matches_p();
const char *test_cmd_spec_restrict_file_names();
const char *test_append_uri_pathel();
const char *test_are_urls_equal();
const char *test_is_robots_txt_url();
int tests_run;
@ -51,6 +53,8 @@ all_tests()
mu_run_test (test_dir_matches_p);
mu_run_test (test_cmd_spec_restrict_file_names);
mu_run_test (test_append_uri_pathel);
mu_run_test (test_are_urls_equal);
mu_run_test (test_is_robots_txt_url);
return NULL;
}

View File

@ -1926,6 +1926,64 @@ schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
return false;
}
/* Return true if CH is an ASCII hexadecimal digit.  Spelled out by hand
   so the answer does not depend on the current locale.  */
static bool
escape_hexdigit_p (char ch)
{
  return ((ch >= '0' && ch <= '9')
          || (ch >= 'a' && ch <= 'f')
          || (ch >= 'A' && ch <= 'F'));
}

/* Read one logical character from the beginning of STR and store it in
   *C, decoding a percent-escape if one is present.

   STR must be a non-empty string and C must be non-NULL; both are
   checked with assert.

   If STR starts with '%' followed by two hexadecimal digits, *C
   receives the value computed by X2DIGITS_TO_NUM and 3 is returned.
   Otherwise *C receives the first byte of STR unchanged and 1 is
   returned.  This covers a '%' that does not start a well-formed escape
   ("%%", "%zz", or a '%' too close to the end of the string), so that
   malformed input is compared literally rather than being decoded into
   an arbitrary value or rejected outright.

   The return value is the number of bytes of STR consumed and is always
   positive.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  /* The && chain stops at the first failing test, so str[2] is never
     read when str[1] is the terminating NUL.  */
  if (str[0] == '%'
      && escape_hexdigit_p (str[1])
      && escape_hexdigit_p (str[2]))
    {
      *c = X2DIGITS_TO_NUM (str[1], str[2]);
      return 3;
    }

  *c = str[0];
  return 1;
}
/* Compare the URL strings U1 and U2 one logical character at a time.

   Each character is obtained with getchar_from_escaped_string, so a
   percent-escape compares equal to the character it encodes (e.g. "~"
   and "%7e"), and the characters are passed through TOLOWER before
   being compared.

   Returns true only if both strings are consumed completely with every
   logical character matching.  If getchar_from_escaped_string reports
   an error by returning 0, the comparison stops and the result is
   false.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *p, *q;
  int pp, qq;
  char ch1, ch2;

  assert (u1 && u2);

  p = u1;
  q = u2;

  /* Both *p and *q must be tested before decoding: the decoder asserts
     that its input is non-empty, so without the *q test a U2 that is a
     proper prefix of U1 would trip that assertion instead of simply
     comparing unequal.  */
  while (*p && *q
         && (pp = getchar_from_escaped_string (p, &ch1))
         && (qq = getchar_from_escaped_string (q, &ch2))
         && (TOLOWER (ch1) == TOLOWER (ch2)))
    {
      p += pp;
      q += qq;
    }

  return *p == 0 && *q == 0;
}
#if 0
/* Debugging and testing support for path_simplify. */
@ -2036,5 +2094,33 @@ test_append_uri_pathel()
return NULL;
}
/* Unit test for are_urls_equal.  Each table entry holds two URLs and
   the result are_urls_equal is expected to return for them.  Returns
   NULL when every entry matches; otherwise mu_assert is expected to
   return the failure message.  */
const char*
test_are_urls_equal()
{
  struct {
    char *url1;
    char *url2;
    bool expected_result;
  } cases[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true },
  };
  const int ncases = sizeof (cases) / sizeof (cases[0]);
  int idx = 0;

  while (idx < ncases)
    {
      bool actual = are_urls_equal (cases[idx].url1, cases[idx].url2);

      mu_assert ("test_are_urls_equal: wrong result",
                 actual == cases[idx].expected_result);
      ++idx;
    }

  return NULL;
}
#endif /* TESTING */
/*
* vim: et ts=2 sw=2
*/

View File

@ -97,4 +97,6 @@ int mkalldirs (const char *);
char *rewrite_shorthand_url (const char *);
bool schemes_are_similar_p (enum url_scheme a, enum url_scheme b);
bool are_urls_equal (const char *u1, const char *u2);
#endif /* URL_H */