mirror of
https://github.com/moparisthebest/wget
[svn] Plug in new implementation of RES.
Published in <sxselmwddt0.fsf@florida.arsdigita.de>.
commit 05f90bb302
parent 12c8ef1796
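In outline, the change drops recur.c's private robots machinery (robots_url, retrieve_robots, parse_robots, robots_match) and routes everything through the new res.c module. Condensed from the src/recur.c hunk further down, the per-host check in recursive_retrieve() now goes roughly like this (a sketch for orientation, not a standalone program; `u' is the parsed URL and `constr' its canonical string form):

    struct robot_specs *specs = res_get_specs (u->host, u->port);
    if (!specs)
      {
        char *rfile;
        if (res_retrieve_file (constr, &rfile))
          {
            specs = res_parse_from_file (rfile);
            xfree (rfile);
          }
        else
          /* Produce dummy specs so the same robots.txt is not refetched. */
          specs = res_parse ("", 0);
        res_register_specs (u->host, u->port, specs);
      }
    if (!res_match_path (specs, u->path))
      /* robots.txt forbids this URL; it is marked undesirable and skipped. */
      inl = 1;

Parsed specs are cached per host:port, so robots.txt is fetched at most once per server.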
src/ChangeLog
@@ -1,3 +1,7 @@
2001-05-12 Hrvoje Niksic <hniksic@arsdigita.com>

        * res.c: New file. Implement all RES-related code here.

2001-11-18 Hrvoje Niksic <hniksic@arsdigita.com>

        * version.c: Wget 1.7.1 is released.
src/Makefile.in
@@ -64,8 +64,9 @@ SSL_OBJ = @SSL_OBJ@
OBJ = $(ALLOCA) cmpt$o connect$o cookies$o fnmatch$o ftp$o \
      ftp-basic$o ftp-ls$o $(OPIE_OBJ) getopt$o hash$o \
      headers$o host$o html-parse$o html-url$o http$o init$o \
      log$o main$o $(MD5_OBJ) netrc$o rbuf$o recur$o retr$o \
      snprintf$o $(SSL_OBJ) url$o utils$o version$o safe-ctype$o
      log$o main$o $(MD5_OBJ) netrc$o rbuf$o recur$o res$o \
      retr$o safe-ctype$o snprintf$o $(SSL_OBJ) url$o \
      utils$o version$o

.SUFFIXES:
.SUFFIXES: .c .o ._c ._o
312  src/recur.c
@@ -42,6 +42,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include "fnmatch.h"
#include "host.h"
#include "hash.h"
#include "res.h"

#ifndef errno
extern int errno;
@@ -49,8 +50,6 @@ extern int errno;

extern char *version_string;

#define ROBOTS_FILENAME "robots.txt"

static struct hash_table *dl_file_url_map;
static struct hash_table *dl_url_file_map;

@@ -61,26 +60,14 @@ static slist *downloaded_html_files;
/* List of undesirable-to-load URLs. */
static struct hash_table *undesirable_urls;

/* List of forbidden locations. */
static char **forbidden = NULL;

/* Current recursion depth. */
static int depth;

/* Base directory we're recursing from (used by no_parent). */
static char *base_dir;

/* The host name for which we last checked robots. */
static char *robots_host;

static int first_time = 1;

/* Construct the robots URL. */
static struct urlinfo *robots_url PARAMS ((const char *, const char *));
static uerr_t retrieve_robots PARAMS ((const char *, const char *));
static char **parse_robots PARAMS ((const char *));
static int robots_match PARAMS ((struct urlinfo *, char **));


/* Cleanup the data structures associated with recursive retrieving
   (the variables above). */
@@ -105,12 +92,9 @@ recursive_cleanup (void)
      dl_url_file_map = NULL;
    }
  undesirable_urls = NULL;
  free_vec (forbidden);
  forbidden = NULL;
  slist_free (downloaded_html_files);
  downloaded_html_files = NULL;
  FREE_MAYBE (base_dir);
  FREE_MAYBE (robots_host);
  first_time = 1;
}

@@ -137,9 +121,7 @@ recursive_retrieve (const char *file, const char *this_url)
  int meta_disallow_follow;
  int this_url_ftp; /* See below the explanation */
  uerr_t err;
  struct urlinfo *rurl;
  urlpos *url_list, *cur_url;
  char *rfile; /* For robots */
  struct urlinfo *u;

  assert (this_url != NULL);
@@ -176,8 +158,6 @@ recursive_retrieve (const char *file, const char *this_url)
        }
      freeurl (u, 1);
      depth = 1;
      robots_host = NULL;
      forbidden = NULL;
      first_time = 0;
    }
  else
@@ -399,39 +379,31 @@ recursive_retrieve (const char *file, const char *this_url)
      /* What about robots.txt? */
      if (!inl && opt.use_robots && u->proto == URLHTTP)
        {
          /* Since Wget knows about only one set of robot rules at a
             time, /robots.txt must be reloaded whenever a new host is
             accessed.

             robots_host holds the host the current `forbid' variable
             is assigned to. */
          if (!robots_host || !same_host (robots_host, u->host))
          struct robot_specs *specs = res_get_specs (u->host, u->port);
          if (!specs)
            {
              FREE_MAYBE (robots_host);
              /* Now make robots_host the new host, no matter what the
                 result will be. So if there is no /robots.txt on the
                 site, Wget will not retry getting robots all the
                 time. */
              robots_host = xstrdup (u->host);
              free_vec (forbidden);
              forbidden = NULL;
              err = retrieve_robots (constr, ROBOTS_FILENAME);
              if (err == ROBOTSOK)
              char *rfile;
              if (res_retrieve_file (constr, &rfile))
                {
                  rurl = robots_url (constr, ROBOTS_FILENAME);
                  rfile = url_filename (rurl);
                  forbidden = parse_robots (rfile);
                  freeurl (rurl, 1);
                  specs = res_parse_from_file (rfile);
                  xfree (rfile);
                }
              else
                {
                  /* If we cannot get real specs, at least produce
                     dummy ones so that we can register them and stop
                     trying to retrieve them. */
                  specs = res_parse ("", 0);
                }
              res_register_specs (u->host, u->port, specs);
            }

          /* Now that we have (or don't have) robots, we can check for
             them. */
          if (!robots_match (u, forbidden))
          /* Now that we have (or don't have) robots.txt specs, we can
             check what they say. */
          if (!res_match_path (specs, u->path))
            {
              DEBUGP (("Stuffing %s because %s forbids it.\n", this_url,
                       ROBOTS_FILENAME));
              DEBUGP (("Not following %s because robots.txt forbids it.\n",
                       constr));
              string_set_add (undesirable_urls, constr);
              inl = 1;
            }
@@ -650,249 +622,3 @@ convert_all_links (void)
          free_urlpos (urls);
        }
    }

/* Robots support. */

/* Construct the robots URL. */
static struct urlinfo *
robots_url (const char *url, const char *robots_filename)
{
  struct urlinfo *u = newurl ();
  uerr_t err;

  err = parseurl (url, u, 0);
  assert (err == URLOK && u->proto == URLHTTP);
  xfree (u->file);
  xfree (u->dir);
  xfree (u->url);
  u->dir = xstrdup ("");
  u->file = xstrdup (robots_filename);
  u->url = str_url (u, 0);
  return u;
}

/* Retrieves the robots_filename from the root server directory, if
   possible. Returns ROBOTSOK if robots were retrieved OK, and
   NOROBOTS if robots could not be retrieved for any reason. */
static uerr_t
retrieve_robots (const char *url, const char *robots_filename)
{
  int dt;
  uerr_t err;
  struct urlinfo *u;

  u = robots_url (url, robots_filename);
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  err = retrieve_url (u->url, NULL, NULL, NULL, &dt);
  freeurl (u, 1);
  if (err == RETROK)
    return ROBOTSOK;
  else
    return NOROBOTS;
}

/* Parse the robots_filename and return the disallowed path components
   in a malloc-ed vector of character pointers.

   It should be fully compliant with the syntax as described in the
   file norobots.txt, adopted by the robots mailing list
   (robots@webcrawler.com). */
static char **
parse_robots (const char *robots_filename)
{
  FILE *fp;
  char **entries;
  char *line, *cmd, *str, *p;
  char *base_version, *version;
  int num, i;
  int wget_matched; /* is the part meant for Wget? */

  entries = NULL;

  num = 0;
  fp = fopen (robots_filename, "rb");
  if (!fp)
    return NULL;

  /* Kill version number. */
  if (opt.useragent)
    {
      STRDUP_ALLOCA (base_version, opt.useragent);
      STRDUP_ALLOCA (version, opt.useragent);
    }
  else
    {
      int len = 10 + strlen (version_string);
      base_version = (char *)alloca (len);
      sprintf (base_version, "Wget/%s", version_string);
      version = (char *)alloca (len);
      sprintf (version, "Wget/%s", version_string);
    }
  for (p = version; *p; p++)
    *p = TOLOWER (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = TOLOWER (*p);
  *p = '\0';

  /* Setting this to 1 means that Wget considers itself under
     restrictions by default, even if the User-Agent field is not
     present. However, if it finds the user-agent set to anything
     other than Wget, the rest will be ignored (up to the following
     User-Agent field). Thus you may have something like:

     Disallow: 1
     Disallow: 2
     User-Agent: stupid-robot
     Disallow: 3
     Disallow: 4
     User-Agent: Wget*
     Disallow: 5
     Disallow: 6
     User-Agent: *
     Disallow: 7

     In this case the 1, 2, 5, 6 and 7 disallow lines will be
     stored. */
  wget_matched = 1;
  while ((line = read_whole_line (fp)))
    {
      int len = strlen (line);
      /* Destroy <CR><LF> if present. */
      if (len && line[len - 1] == '\n')
        line[--len] = '\0';
      if (len && line[len - 1] == '\r')
        line[--len] = '\0';
      /* According to specifications, optional space may be at the
         end... */
      DEBUGP (("Line: %s\n", line));
      /* Skip spaces. */
      for (cmd = line; *cmd && ISSPACE (*cmd); cmd++);
      if (!*cmd)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Look for ':'. */
      for (str = cmd; *str && *str != ':'; str++);
      if (!*str)
        {
          xfree (line);
          DEBUGP (("(chucked out)\n"));
          continue;
        }
      /* Zero-terminate the command. */
      *str++ = '\0';
      /* Look for the string beginning... */
      for (; *str && ISSPACE (*str); str++);
      /* Look for comments or trailing spaces and kill them off. */
      for (p = str; *p; p++)
        if (*p && ISSPACE (*p) && ((*(p + 1) == '#') || (*(p + 1) == '\0')))
          {
            /* We have found either a shell-style comment `<sp>+#' or some
               trailing spaces. Now rewind to the beginning of the spaces
               and place '\0' there. */
            while (p > str && ISSPACE (*p))
              --p;
            if (p == str)
              *p = '\0';
            else
              *(p + 1) = '\0';
            break;
          }
      if (!strcasecmp (cmd, "User-agent"))
        {
          int match = 0;
          /* Lowercase the agent string. */
          for (p = str; *p; p++)
            *p = TOLOWER (*p);
          /* If the string is `*', it matches. */
          if (*str == '*' && !*(str + 1))
            match = 1;
          else
            {
              /* If the string contains wildcards, we'll run it through
                 fnmatch(). */
              if (has_wildcards_p (str))
                {
                  /* If the string contains '/', compare with the full
                     version. Else, compare it to base_version. */
                  if (strchr (str, '/'))
                    match = !fnmatch (str, version, 0);
                  else
                    match = !fnmatch (str, base_version, 0);
                }
              else /* Substring search */
                {
                  if (strstr (version, str))
                    match = 1;
                  else
                    match = 0;
                }
            }
          /* If Wget is not matched, skip all the entries up to the
             next User-agent field. */
          wget_matched = match;
        }
      else if (!wget_matched)
        {
          xfree (line);
          DEBUGP (("(chucking out since it is not applicable for Wget)\n"));
          continue;
        }
      else if (!strcasecmp (cmd, "Disallow"))
        {
          /* If "Disallow" is empty, the robot is welcome. */
          if (!*str)
            {
              free_vec (entries);
              entries = (char **)xmalloc (sizeof (char *));
              *entries = NULL;
              num = 0;
            }
          else
            {
              entries = (char **)xrealloc (entries, (num + 2)* sizeof (char *));
              entries[num] = xstrdup (str);
              entries[++num] = NULL;
              /* Strip trailing spaces, according to specifications. */
              for (i = strlen (str); i >= 0 && ISSPACE (str[i]); i--)
                if (ISSPACE (str[i]))
                  str[i] = '\0';
            }
        }
      else
        {
          /* unknown command */
          DEBUGP (("(chucked out)\n"));
        }
      xfree (line);
    }
  fclose (fp);
  return entries;
}

/* May the URL url be loaded according to disallowing rules stored in
   forbidden? */
static int
robots_match (struct urlinfo *u, char **fb)
{
  int l;

  if (!fb)
    return 1;
  DEBUGP (("Matching %s against: ", u->path));
  for (; *fb; fb++)
    {
      DEBUGP (("%s ", *fb));
      l = strlen (*fb);
      /* If dir is fb, we may not load the file. */
      if (strncmp (u->path, *fb, l) == 0)
        {
          DEBUGP (("matched.\n"));
          return 0; /* Matches, i.e. does not load... */
        }
    }
  DEBUGP (("not matched.\n"));
  return 1;
}
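The parse_robots/robots_match pair removed above kept a flat vector of Disallow prefixes; its replacement lives in the new src/res.c below, where the first rule whose path is a prefix of the requested path decides, and anything unmatched is allowed by default. A minimal usage sketch (not part of the commit; it assumes compilation inside the Wget source tree so that "wget.h" and "res.h" are available):

    #include <assert.h>
    #include "wget.h"
    #include "res.h"

    static void
    check_simple_specs (void)
    {
      static const char text[] =
        "User-Agent: wget\n"
        "Disallow: /cgi-bin\n";
      struct robot_specs *specs = res_parse (text, sizeof (text) - 1);

      /* The first rule whose path prefix-matches decides; a path that
         matches no rule is allowed. */
      assert (res_match_path (specs, "/cgi-bin/printenv") == 0);  /* rejected */
      assert (res_match_path (specs, "/index.html") == 1);        /* allowed */
    }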
544  src/res.c  Normal file
@@ -0,0 +1,544 @@
/* Support for Robot Exclusion Standard (RES).
   Copyright (C) 2001 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or (at
   your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

/* This file implements the Robot Exclusion Standard (RES).

   RES is a simple protocol that enables site admins to signalize to
   the web crawlers that certain parts of the site should not be
   accessed. All the admin needs to do is create a "robots.txt" file
   in the web server root, and use simple commands to allow or
   disallow access to certain parts of the site.

   The first specification was written by Martijn Koster in 1994, and
   is still available at <http://www.robotstxt.org/wc/norobots.html>.
   In 1996, Martijn wrote an Internet Draft specifying an improved RES
   specification; however, that work was apparently abandoned since
   the draft has expired in 1997 and hasn't been replaced since. The
   draft is available at
   <http://www.robotstxt.org/wc/norobots-rfc.html>.

   This file implements RES as specified by the draft. Note that this
   only handles the "robots.txt" support. The META tag that controls
   whether the links should be followed is handled in `html-url.c'.

   Known deviations:

   * The end-of-line comment recognition is more in the spirit of the
     Bourne Shell (as specified by RES-1994). That means that
     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
     as "foo". The Draft apparently specifies that both should be
     interpreted as "foo".

   * We don't recognize sole CR as the line ending.

   * We don't implement expiry mechanism for /robots.txt specs. I
     consider it non-necessary for a relatively short-lived
     application such as Wget. Besides, it is highly questionable
     whether anyone deploys the recommended expiry scheme for
     robots.txt.

   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs, and
   res_retrieve_file. */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif /* HAVE_STRING_H */
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "utils.h"
#include "hash.h"
#include "url.h"
#include "retr.h"
#include "res.h"

struct path_info {
  char *path;
  int allowedp;
  int user_agent_exact_p;
};

struct robot_specs {
  int count;
  int size;
  struct path_info *paths;
};

/* Parsing the robot spec. */

/* Check whether AGENT (a string of length LENGTH) equals "wget" or
   "*". If it is either of them, *matches is set to one. If it is
   "wget", *exact_match is set to one. */

static void
match_user_agent (const char *agent, int length,
                  int *matches, int *exact_match)
{
  if (length == 1 && *agent == '*')
    {
      *matches = 1;
      *exact_match = 0;
    }
  else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
    {
      *matches = 1;
      *exact_match = 1;
    }
  else
    {
      *matches = 0;
      *exact_match = 0;
    }
}

/* Add a path specification between PATH_B and PATH_E as one of the
   paths in SPECS. */

static void
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
          int allowedp, int exactp)
{
  struct path_info pp;
  pp.path = strdupdelim (path_b, path_e);
  pp.allowedp = allowedp;
  pp.user_agent_exact_p = exactp;
  ++specs->count;
  if (specs->count > specs->size)
    {
      if (specs->size == 0)
        specs->size = 1;
      else
        specs->size <<= 1;
      specs->paths = xrealloc (specs->paths,
                               specs->size * sizeof (struct path_info));
    }
  specs->paths[specs->count - 1] = pp;
}

/* Recreate SPECS->paths with only those paths that have non-zero
   user_agent_exact_p. */

static void
prune_non_exact (struct robot_specs *specs)
{
  struct path_info *newpaths;
  int i, j, cnt;
  cnt = 0;
  for (i = 0; i < specs->count; i++)
    if (specs->paths[i].user_agent_exact_p)
      ++cnt;
  newpaths = xmalloc (cnt * sizeof (struct path_info));
  for (i = 0, j = 0; i < specs->count; i++)
    if (specs->paths[i].user_agent_exact_p)
      newpaths[j++] = specs->paths[i];
  assert (j == cnt);
  xfree (specs->paths);
  specs->paths = newpaths;
  specs->count = cnt;
  specs->size = cnt;
}

#define EOL(p) ((p) >= lineend)

#define SKIP_SPACE(p) do {                      \
  while (!EOL (p) && ISSPACE (*p))              \
    ++p;                                        \
} while (0)

#define FIELD_IS(string_literal)                \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)

/* Parse textual RES specs beginning with SOURCE of length LENGTH.
   Return a specs objects ready to be fed to res_match_path.

   The parsing itself is trivial, but creating a correct SPECS object
   is trickier than it seems, because RES is surprisingly byzantine if
   you attempt to implement it correctly.

   A "record" is a block of one or more `User-Agent' lines followed by
   one or more `Allow' or `Disallow' lines. Record is accepted by
   Wget if one of the `User-Agent' lines was "wget", or if the user
   agent line was "*".

   After all the lines have been read, we examine whether an exact
   ("wget") user-agent field was specified. If so, we delete all the
   lines read under "User-Agent: *" blocks because we have our own
   Wget-specific blocks. This enables the admin to say:

       User-Agent: *
       Disallow: /

       User-Agent: google
       User-Agent: wget
       Disallow: /cgi-bin

   This means that to Wget and to Google, /cgi-bin is disallowed,
   whereas for all other crawlers, everything is disallowed.
   res_parse is implemented so that the order of records doesn't
   matter. In the case above, the "User-Agent: *" could have come
   after the other one. */

struct robot_specs *
res_parse (const char *source, int length)
{
  int line_count = 1;

  const char *p = source;
  const char *end = source + length;

  /* non-zero if last applicable user-agent field matches Wget. */
  int user_agent_applies = 0;

  /* non-zero if last applicable user-agent field *exactly* matches
     Wget. */
  int user_agent_exact = 0;

  /* whether we ever encountered exact user agent. */
  int found_exact = 0;

  /* count of allow/disallow lines in the current "record", i.e. after
     the last `user-agent' instructions. */
  int record_count = 0;

  struct robot_specs *specs = xmalloc (sizeof (struct robot_specs));
  memset (specs, '\0', sizeof (struct robot_specs));

  while (1)
    {
      const char *lineend, *lineend_real;
      const char *field_b, *field_e;
      const char *value_b, *value_e;

      if (p == end)
        break;
      lineend_real = memchr (p, '\n', end - p);
      if (lineend_real)
        ++lineend_real;
      else
        lineend_real = end;
      lineend = lineend_real;

      /* Before doing anything else, check whether the line is empty
         or comment-only. */
      SKIP_SPACE (p);
      if (EOL (p) || *p == '#')
        goto next;

      /* Make sure the end-of-line comments are respected by setting
         lineend to a location preceding the first comment. Real line
         ending remains in lineend_real. */
      for (lineend = p; lineend < lineend_real; lineend++)
        if ((lineend == p || ISSPACE (*(lineend - 1)))
            && *lineend == '#')
          break;

      /* Ignore trailing whitespace in the same way. */
      while (lineend > p && ISSPACE (*(lineend - 1)))
        --lineend;

      assert (!EOL (p));

      field_b = p;
      while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
        ++p;
      field_e = p;

      SKIP_SPACE (p);
      if (field_b == field_e || EOL (p) || *p != ':')
        {
          DEBUGP (("Ignoring malformed line %d", line_count));
          goto next;
        }
      ++p; /* skip ':' */
      SKIP_SPACE (p);

      value_b = p;
      while (!EOL (p))
        ++p;
      value_e = p;

      /* Finally, we have a syntactically valid line. */
      if (FIELD_IS ("user-agent"))
        {
          /* We have to support several cases:

             --previous records--

             User-Agent: foo
             User-Agent: Wget
             User-Agent: bar
             ... matching record ...

             User-Agent: baz
             User-Agent: qux
             ... non-matching record ...

             User-Agent: *
             ... matching record, but will be pruned later ...

             We have to respect `User-Agent' at the beginning of each
             new record simply because we don't know if we're going to
             encounter "Wget" among the agents or not. Hence,
             match_user_agent is called when record_count != 0.

             But if record_count is 0, we have to keep calling it
             until it matches, and if that happens, we must not call
             it any more, until the next record. Hence the other part
             of the condition. */
          if (record_count != 0 || user_agent_applies == 0)
            match_user_agent (value_b, value_e - value_b,
                              &user_agent_applies, &user_agent_exact);
          if (user_agent_exact)
            found_exact = 1;
          record_count = 0;
        }
      else if (FIELD_IS ("allow"))
        {
          if (user_agent_applies)
            {
              add_path (specs, value_b, value_e, 1, user_agent_exact);
            }
          ++record_count;
        }
      else if (FIELD_IS ("disallow"))
        {
          if (user_agent_applies)
            {
              int allowed = 0;
              if (value_b == value_e)
                /* Empty "disallow" line means everything is
                   *allowed*! */
                allowed = 1;
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
            }
          ++record_count;
        }
      else
        {
          DEBUGP (("Ignoring unknown field at line %d", line_count));
          goto next;
        }

    next:
      p = lineend_real;
      ++line_count;
    }

  if (found_exact)
    {
      /* We've encountered an exactly matching user-agent. Throw out
         all the stuff with user-agent: *. */
      prune_non_exact (specs);
    }
  else if (specs->size > specs->count)
    {
      /* add_path normally over-allocates specs->paths. Reallocate it
         to the correct size in order to conserve some memory. */
      specs->paths = xrealloc (specs->paths,
                               specs->count * sizeof (struct path_info));
      specs->size = specs->count;
    }

  return specs;
}

/* The same like res_parse, but first map the FILENAME into memory,
   and then parse it. */

struct robot_specs *
res_parse_from_file (const char *filename)
{
  struct robot_specs *specs;
  struct file_memory *fm = read_file (filename);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "Cannot open %s: %s",
                 filename, strerror (errno));
      return NULL;
    }
  specs = res_parse (fm->content, fm->length);
  read_file_free (fm);
  return specs;
}

static void
free_specs (struct robot_specs *specs)
{
  FREE_MAYBE (specs->paths);
  xfree (specs);
}

/* Matching of a path according to the specs. */

/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
   that number is not a numerical representation of '/', decode C and
   advance the pointer. */

#define DECODE_MAYBE(c, ptr) do {                               \
  if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2]))       \
    {                                                           \
      char decoded                                              \
        = (XCHAR_TO_XDIGIT (ptr[1]) << 4) + XCHAR_TO_XDIGIT (ptr[2]); \
      if (decoded != '/')                                       \
        {                                                       \
          c = decoded;                                          \
          ptr += 2;                                             \
        }                                                       \
    }                                                           \
} while (0)

/* The inner matching engine: return non-zero if RECORD_PATH matches
   URL_PATH. The rules for matching are described at
   <http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html>,
   section 3.2.2. */

static int
matches (const char *record_path, const char *url_path)
{
  const char *rp = record_path;
  const char *up = url_path;

  for (; ; ++rp, ++up)
    {
      char rc = *rp;
      char uc = *up;
      if (!rc)
        return 1;
      if (!uc)
        return 0;
      DECODE_MAYBE(rc, rp);
      DECODE_MAYBE(uc, up);
      if (rc != uc)
        return 0;
    }
}

/* Iterate through all paths in SPECS. For the first one that
   matches, return its allow/reject status. If none matches,
   retrieval is by default allowed. */

int
res_match_path (const struct robot_specs *specs, const char *path)
{
  int i;
  if (!specs)
    return 1;
  for (i = 0; i < specs->count; i++)
    if (matches (specs->paths[i].path, path))
      {
        int allowedp = specs->paths[i].allowedp;
        DEBUGP (("%s path %s because of rule `%s'.\n",
                 allowedp ? "Allowing" : "Rejecting",
                 path, specs->paths[i].path));
        return allowedp;
      }
  return 1;
}

/* Registering the specs. */

struct hash_table *registered_specs;

/* Stolen from cookies.c. */
#define SET_HOSTPORT(host, port, result) do {           \
  int HP_len = strlen (host);                           \
  result = alloca (HP_len + 1 + numdigit (port) + 1);   \
  memcpy (result, host, HP_len);                        \
  result[HP_len] = ':';                                 \
  long_to_string (result + HP_len + 1, port);           \
} while (0)

/* Register RES specs that below to server on HOST:PORT. They will
   later be retrievable using res_get_specs. */

void
res_register_specs (const char *host, int port, struct robot_specs *specs)
{
  struct robot_specs *old;
  char *hp, *hp_old;
  SET_HOSTPORT (host, port, hp);

  if (!registered_specs)
    registered_specs = make_nocase_string_hash_table (0);

  if (hash_table_get_pair (registered_specs, hp, hp_old, old))
    {
      if (old)
        free_specs (old);
      hash_table_put (registered_specs, hp_old, specs);
    }
  else
    {
      hash_table_put (registered_specs, xstrdup (hp), specs);
    }
}

/* Get the specs that belong to HOST:PORT. */

struct robot_specs *
res_get_specs (const char *host, int port)
{
  char *hp;
  SET_HOSTPORT (host, port, hp);
  if (!registered_specs)
    return NULL;
  return hash_table_get (registered_specs, hp);
}

/* Loading the robots file. */

#define RES_SPECS_LOCATION "/robots.txt"

/* Retrieve the robots.txt from the server root of the server that
   serves URL. The file will be named according to the currently
   active rules, and the file name will be returned in *file.

   Return non-zero if robots were retrieved OK, zero otherwise. */

int
res_retrieve_file (const char *url, char **file)
{
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  err = retrieve_url (robots_url, file, NULL, NULL, NULL);
  xfree (robots_url);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate is here so that the
         caller doesn't have to worry about it. */
      xfree (*file);
      *file = NULL;
    }
  return err == RETROK;
}
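As a worked example of the exact-agent pruning described in the res_parse comment above, the robots.txt quoted in that comment behaves as follows for Wget (sketch only, not part of the commit; assumes compilation inside the Wget source tree):

    #include <assert.h>
    #include "wget.h"
    #include "res.h"

    static void
    exact_agent_overrides_star (void)
    {
      static const char text[] =
        "User-Agent: *\n"
        "Disallow: /\n"
        "\n"
        "User-Agent: google\n"
        "User-Agent: wget\n"
        "Disallow: /cgi-bin\n";
      struct robot_specs *specs = res_parse (text, sizeof (text) - 1);

      /* Because one record names "wget" exactly, res_parse discards the
         paths recorded under "User-Agent: *"; only /cgi-bin remains
         forbidden for Wget.  */
      assert (res_match_path (specs, "/cgi-bin/printenv") == 0);
      assert (res_match_path (specs, "/index.html") == 1);
    }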
31  src/res.h  Normal file
@@ -0,0 +1,31 @@
/* Declarations for res.c.
   Copyright (C) 2001 Free Software Foundation, Inc.

   This file is part of Wget.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

struct robot_specs;

struct robot_specs *res_parse PARAMS ((const char *, int));
struct robot_specs *res_parse_from_file PARAMS ((const char *));

int res_match_path PARAMS ((const struct robot_specs *, const char *));

void res_register_specs PARAMS ((const char *, int, struct robot_specs *));
struct robot_specs *res_get_specs PARAMS ((const char *, int));

int res_retrieve_file PARAMS ((const char *, char **));
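For completeness, a sketch of how these declarations fit together for per-server caching, mirroring what the recur.c hunk does with the real URL's host and port (the helper name cached_specs_for and its arguments are hypothetical, purely for illustration):

    #include "wget.h"
    #include "res.h"

    static struct robot_specs *
    cached_specs_for (const char *host, int port, const char *robots_file)
    {
      struct robot_specs *specs = res_get_specs (host, port);
      if (!specs)
        {
          /* Parse once and register under "host:port"; a missing or
             unparsable robots.txt is registered as empty specs so it is
             not refetched for every URL on the same server.  */
          specs = res_parse_from_file (robots_file);
          if (!specs)
            specs = res_parse ("", 0);
          res_register_specs (host, port, specs);
        }
      return specs;
    }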