1999-12-02 02:42:23 -05:00
|
|
|
/* Dealing with host names.
|
2000-11-19 15:50:10 -05:00
|
|
|
Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2001-05-27 15:35:15 -04:00
|
|
|
This file is part of GNU Wget.
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2001-05-27 15:35:15 -04:00
|
|
|
GNU Wget is free software; you can redistribute it and/or modify
|
1999-12-02 02:42:23 -05:00
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
2001-05-27 15:35:15 -04:00
|
|
|
GNU Wget is distributed in the hope that it will be useful,
|
1999-12-02 02:42:23 -05:00
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
2001-05-27 15:35:15 -04:00
|
|
|
along with Wget; if not, write to the Free Software
|
1999-12-02 02:42:23 -05:00
|
|
|
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#ifdef HAVE_STRING_H
|
|
|
|
# include <string.h>
|
|
|
|
#else
|
|
|
|
# include <strings.h>
|
|
|
|
#endif
|
|
|
|
#include <assert.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
|
|
|
|
#ifdef WINDOWS
|
|
|
|
# include <winsock.h>
|
|
|
|
#else
|
|
|
|
# include <sys/socket.h>
|
|
|
|
# include <netinet/in.h>
|
2001-11-16 11:58:08 -05:00
|
|
|
#ifndef __BEOS__
|
1999-12-02 02:42:23 -05:00
|
|
|
# include <arpa/inet.h>
|
2001-11-16 11:58:08 -05:00
|
|
|
#endif
|
1999-12-02 02:42:23 -05:00
|
|
|
# include <netdb.h>
|
|
|
|
#endif /* WINDOWS */
|
|
|
|
|
2001-11-16 11:58:08 -05:00
|
|
|
#ifndef NO_ADDRESS
|
|
|
|
#define NO_ADDRESS NO_DATA
|
|
|
|
#endif
|
|
|
|
|
1999-12-02 02:42:23 -05:00
|
|
|
#ifdef HAVE_SYS_UTSNAME_H
|
|
|
|
# include <sys/utsname.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
|
|
|
|
|
|
|
#include "wget.h"
|
|
|
|
#include "utils.h"
|
|
|
|
#include "host.h"
|
|
|
|
#include "url.h"
|
2000-11-19 15:50:10 -05:00
|
|
|
#include "hash.h"
|
1999-12-02 02:42:23 -05:00
|
|
|
|
|
|
|
#ifndef errno
|
|
|
|
extern int errno;
|
|
|
|
#endif
|
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
/* Mapping between all known hosts to their addresses (n.n.n.n). */
|
|
|
|
struct hash_table *host_name_address_map;
|
|
|
|
|
|
|
|
/* Mapping between all known addresses (n.n.n.n) to their hosts. This
|
|
|
|
is the inverse of host_name_address_map. These two tables share
|
|
|
|
the strdup'ed strings. */
|
|
|
|
struct hash_table *host_address_name_map;
|
|
|
|
|
|
|
|
/* Mapping between auxiliary (slave) and master host names. */
|
|
|
|
struct hash_table *host_slave_master_map;
|
|
|
|
|
|
|
|
/* Return a copy of S, obtained from xstrdup(), in which every
   character has been run through TOLOWER.  S itself is left
   untouched; the returned string belongs to the caller.  */

static char *
xstrdup_lower (const char *s)
{
  char *lowered = xstrdup (s);
  char *cursor = lowered;

  /* Lowercase the copy in place, stopping at the terminating NUL.  */
  while (*cursor)
    {
      *cursor = TOLOWER (*cursor);
      ++cursor;
    }
  return lowered;
}
|
1999-12-02 02:42:23 -05:00
|
|
|
|
|
|
|
/* The same as gethostbyname, but supports internet addresses of the
   form `N.N.N.N'.  On some systems gethostbyname() knows how to do
   this automatically.

   NAME is first handed to inet_addr().  If that yields anything other
   than -1, the result is treated as a numeric IPv4 address and looked
   up with gethostbyaddr(); otherwise NAME is resolved with
   gethostbyname().  The return value is whatever the chosen resolver
   function returned.  */
struct hostent *
ngethostbyname (const char *name)
{
  unsigned long addr = (unsigned long)inet_addr (name);

  if ((int)addr != -1)
    {
      /* An AF_INET address is exactly four octets, so pass exactly
         four bytes rather than sizeof (unsigned long), which is 8
         where long is 64 bits wide.  ADDR is in network byte order;
         where unsigned long is wider than four bytes on a big endian
         machine the octets are the last four bytes of the object,
         which is the same adjustment store_hostaddress() makes.  */
      int offset;
#ifdef WORDS_BIGENDIAN
      offset = sizeof (unsigned long) - 4;
#else
      offset = 0;
#endif
      return gethostbyaddr ((char *)&addr + offset, 4, AF_INET);
    }

  return gethostbyname (name);
}
|
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
/* Add host name HOST with the address ADDR_TEXT to the cache.
|
|
|
|
Normally this means that the (HOST, ADDR_TEXT) pair will be to
|
|
|
|
host_name_address_map and to host_address_name_map. (It is the
|
|
|
|
caller's responsibility to make sure that HOST is not already in
|
|
|
|
host_name_address_map.)
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
If the ADDR_TEXT has already been seen and belongs to another host,
|
|
|
|
HOST will be added to host_slave_master_map instead. */
|
|
|
|
|
|
|
|
static void
|
|
|
|
add_host_to_cache (const char *host, const char *addr_text)
|
1999-12-02 02:42:23 -05:00
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
char *canonical_name = hash_table_get (host_address_name_map, addr_text);
|
|
|
|
if (canonical_name)
|
|
|
|
{
|
|
|
|
DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
|
|
|
|
host, canonical_name));
|
|
|
|
/* We've already dealt with that host under another name. */
|
|
|
|
hash_table_put (host_slave_master_map,
|
|
|
|
xstrdup_lower (host),
|
|
|
|
xstrdup_lower (canonical_name));
|
|
|
|
}
|
|
|
|
else
|
1999-12-02 02:42:23 -05:00
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
/* This is really the first time we're dealing with that host. */
|
|
|
|
char *h_copy = xstrdup_lower (host);
|
|
|
|
char *a_copy = xstrdup (addr_text);
|
|
|
|
DEBUGP (("Caching %s <-> %s\n", h_copy, a_copy));
|
|
|
|
hash_table_put (host_name_address_map, h_copy, a_copy);
|
|
|
|
hash_table_put (host_address_name_map, a_copy, h_copy);
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
/* Store the address of HOSTNAME, internet-style (four octets in
|
|
|
|
network order), to WHERE. First try to get the address from the
|
|
|
|
cache; if it is not available, call the DNS functions and update
|
|
|
|
the cache.
|
1999-12-02 02:42:23 -05:00
|
|
|
|
|
|
|
Return 1 on successful finding of the hostname, 0 otherwise. */
|
|
|
|
int
|
|
|
|
store_hostaddress (unsigned char *where, const char *hostname)
|
|
|
|
{
|
|
|
|
unsigned long addr;
|
2000-11-19 15:50:10 -05:00
|
|
|
char *addr_text;
|
|
|
|
char *canonical_name;
|
1999-12-02 02:42:23 -05:00
|
|
|
struct hostent *hptr;
|
|
|
|
struct in_addr in;
|
|
|
|
char *inet_s;
|
|
|
|
|
|
|
|
/* If the address is of the form d.d.d.d, there will be no trouble
|
|
|
|
with it. */
|
|
|
|
addr = (unsigned long)inet_addr (hostname);
|
|
|
|
/* If we have the numeric address, just store it. */
|
|
|
|
if ((int)addr != -1)
|
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
/* ADDR is defined to be in network byte order, meaning the code
|
|
|
|
works on little and big endian 32-bit architectures without
|
|
|
|
change. On big endian 64-bit architectures we need to be
|
|
|
|
careful to copy the correct four bytes. */
|
|
|
|
int offset;
|
|
|
|
have_addr:
|
2000-03-31 09:07:07 -05:00
|
|
|
#ifdef WORDS_BIGENDIAN
|
2000-04-14 05:31:21 -04:00
|
|
|
offset = sizeof (unsigned long) - 4;
|
2000-11-19 15:50:10 -05:00
|
|
|
#else
|
|
|
|
offset = 0;
|
2000-03-31 09:07:07 -05:00
|
|
|
#endif
|
2000-04-14 05:31:21 -04:00
|
|
|
memcpy (where, (char *)&addr + offset, 4);
|
1999-12-02 02:42:23 -05:00
|
|
|
return 1;
|
|
|
|
}
|
2000-11-19 15:50:10 -05:00
|
|
|
|
|
|
|
/* By now we know that the address is not of the form d.d.d.d. Try
|
|
|
|
to find it in our cache of host addresses. */
|
|
|
|
addr_text = hash_table_get (host_name_address_map, hostname);
|
|
|
|
if (addr_text)
|
|
|
|
{
|
|
|
|
DEBUGP (("Found %s in host_name_address_map: %s\n",
|
|
|
|
hostname, addr_text));
|
|
|
|
addr = (unsigned long)inet_addr (addr_text);
|
|
|
|
goto have_addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Maybe this host is known to us under another name. If so, we'll
|
|
|
|
find it in host_slave_master_map, and use the master name to find
|
|
|
|
its address in host_name_address_map. */
|
|
|
|
canonical_name = hash_table_get (host_slave_master_map, hostname);
|
|
|
|
if (canonical_name)
|
|
|
|
{
|
|
|
|
addr_text = hash_table_get (host_name_address_map, canonical_name);
|
|
|
|
assert (addr_text != NULL);
|
|
|
|
DEBUGP (("Found %s as slave of %s -> %s\n",
|
|
|
|
hostname, canonical_name, addr_text));
|
|
|
|
addr = (unsigned long)inet_addr (addr_text);
|
|
|
|
goto have_addr;
|
|
|
|
}
|
|
|
|
|
1999-12-02 02:42:23 -05:00
|
|
|
/* Since all else has failed, let's try gethostbyname(). Note that
|
|
|
|
we use gethostbyname() rather than ngethostbyname(), because we
|
2000-11-19 15:50:10 -05:00
|
|
|
already know that the address is not numerical. */
|
1999-12-02 02:42:23 -05:00
|
|
|
hptr = gethostbyname (hostname);
|
|
|
|
if (!hptr)
|
|
|
|
return 0;
|
|
|
|
/* Copy the address of the host to socket description. */
|
|
|
|
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
|
2000-11-19 15:50:10 -05:00
|
|
|
assert (hptr->h_length == 4);
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
/* Now that we've gone through the truoble of calling
|
|
|
|
gethostbyname(), we can store this valuable information to the
|
|
|
|
cache. First, we have to look for it by address to know if it's
|
|
|
|
already in the cache by another name. */
|
1999-12-02 02:42:23 -05:00
|
|
|
/* Originally, we copied to in.s_addr, but it appears to be missing
|
|
|
|
on some systems. */
|
|
|
|
memcpy (&in, *hptr->h_addr_list, sizeof (in));
|
2000-11-19 15:50:10 -05:00
|
|
|
inet_s = inet_ntoa (in);
|
|
|
|
add_host_to_cache (hostname, inet_s);
|
1999-12-02 02:42:23 -05:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
|
|
|
|
is referenced by more than one name, "real" name is considered to
|
2000-11-19 15:50:10 -05:00
|
|
|
be the first one encountered in the past. */
|
1999-12-02 02:42:23 -05:00
|
|
|
char *
|
|
|
|
realhost (const char *host)
|
|
|
|
{
|
|
|
|
struct in_addr in;
|
|
|
|
struct hostent *hptr;
|
2000-11-19 15:50:10 -05:00
|
|
|
char *master_name;
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
DEBUGP (("Checking for %s in host_name_address_map.\n", host));
|
2001-05-12 16:06:41 -04:00
|
|
|
if (hash_table_contains (host_name_address_map, host))
|
1999-12-02 02:42:23 -05:00
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
DEBUGP (("Found; %s was already used, by that name.\n", host));
|
|
|
|
return xstrdup_lower (host);
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|
2000-11-19 15:50:10 -05:00
|
|
|
|
|
|
|
DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
|
|
|
|
master_name = hash_table_get (host_slave_master_map, host);
|
|
|
|
if (master_name)
|
1999-12-02 02:42:23 -05:00
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
has_master:
|
|
|
|
DEBUGP (("Found; %s was already used, by the name %s.\n",
|
|
|
|
host, master_name));
|
|
|
|
return xstrdup (master_name);
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
DEBUGP (("First time I hear about %s by that name; looking it up.\n",
|
|
|
|
host));
|
|
|
|
hptr = ngethostbyname (host);
|
|
|
|
if (hptr)
|
1999-12-02 02:42:23 -05:00
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
char *inet_s;
|
|
|
|
/* Originally, we copied to in.s_addr, but it appears to be
|
|
|
|
missing on some systems. */
|
|
|
|
memcpy (&in, *hptr->h_addr_list, sizeof (in));
|
|
|
|
inet_s = inet_ntoa (in);
|
|
|
|
|
|
|
|
add_host_to_cache (host, inet_s);
|
|
|
|
|
|
|
|
/* add_host_to_cache() can establish a slave-master mapping. */
|
|
|
|
DEBUGP (("Checking again for %s in host_slave_master_map.\n", host));
|
|
|
|
master_name = hash_table_get (host_slave_master_map, host);
|
|
|
|
if (master_name)
|
|
|
|
goto has_master;
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|
2000-11-19 15:50:10 -05:00
|
|
|
|
|
|
|
return xstrdup_lower (host);
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
|
|
|
|
taking care of aliases. It uses realhost() to determine a unique
|
|
|
|
hostname for each of two hosts. If simple_check is non-zero, only
|
|
|
|
strcmp() is used for comparison. */
|
|
|
|
int
|
|
|
|
same_host (const char *u1, const char *u2)
|
|
|
|
{
|
|
|
|
const char *s;
|
|
|
|
char *p1, *p2;
|
|
|
|
char *real1, *real2;
|
|
|
|
|
|
|
|
/* Skip protocol, if present. */
|
|
|
|
u1 += skip_proto (u1);
|
|
|
|
u2 += skip_proto (u2);
|
|
|
|
|
|
|
|
/* Skip username ans password, if present. */
|
|
|
|
u1 += skip_uname (u1);
|
|
|
|
u2 += skip_uname (u2);
|
|
|
|
|
|
|
|
for (s = u1; *u1 && *u1 != '/' && *u1 != ':'; u1++);
|
|
|
|
p1 = strdupdelim (s, u1);
|
|
|
|
for (s = u2; *u2 && *u2 != '/' && *u2 != ':'; u2++);
|
|
|
|
p2 = strdupdelim (s, u2);
|
|
|
|
DEBUGP (("Comparing hosts %s and %s...\n", p1, p2));
|
|
|
|
if (strcasecmp (p1, p2) == 0)
|
|
|
|
{
|
2000-11-22 11:58:28 -05:00
|
|
|
xfree (p1);
|
|
|
|
xfree (p2);
|
1999-12-02 02:42:23 -05:00
|
|
|
DEBUGP (("They are quite alike.\n"));
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else if (opt.simple_check)
|
|
|
|
{
|
2000-11-22 11:58:28 -05:00
|
|
|
xfree (p1);
|
|
|
|
xfree (p2);
|
1999-12-02 02:42:23 -05:00
|
|
|
DEBUGP (("Since checking is simple, I'd say they are not the same.\n"));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
real1 = realhost (p1);
|
|
|
|
real2 = realhost (p2);
|
2000-11-22 11:58:28 -05:00
|
|
|
xfree (p1);
|
|
|
|
xfree (p2);
|
1999-12-02 02:42:23 -05:00
|
|
|
if (strcasecmp (real1, real2) == 0)
|
|
|
|
{
|
|
|
|
DEBUGP (("They are alike, after realhost()->%s.\n", real1));
|
2000-11-22 11:58:28 -05:00
|
|
|
xfree (real1);
|
|
|
|
xfree (real2);
|
1999-12-02 02:42:23 -05:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
DEBUGP (("They are not the same (%s, %s).\n", real1, real2));
|
2000-11-22 11:58:28 -05:00
|
|
|
xfree (real1);
|
|
|
|
xfree (real2);
|
1999-12-02 02:42:23 -05:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Determine whether a URL is acceptable to be followed, according to
|
|
|
|
a list of domains to accept. */
|
|
|
|
int
|
|
|
|
accept_domain (struct urlinfo *u)
|
|
|
|
{
|
|
|
|
assert (u->host != NULL);
|
|
|
|
if (opt.domains)
|
|
|
|
{
|
|
|
|
if (!sufmatch ((const char **)opt.domains, u->host))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (opt.exclude_domains)
|
|
|
|
{
|
|
|
|
if (sufmatch ((const char **)opt.exclude_domains, u->host))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check whether WHAT ends with one of the strings in LIST, which is
   a NULL-terminated array of patterns.  Characters are compared
   through TOLOWER, walking both strings backwards from their
   terminating NUL (see match_backwards() in utils.c).

   Return 1 as soon as an element of LIST matches, 0 if none does.  */
int
sufmatch (const char **list, const char *what)
{
  int what_len = strlen (what);
  const char **pattern;

  for (pattern = list; *pattern; pattern++)
    {
      int pi = strlen (*pattern);
      int wi = what_len;

      /* Step backwards through both strings for as long as they
	 agree and neither has run out.  */
      while (pi >= 0 && wi >= 0
	     && TOLOWER ((*pattern)[pi]) == TOLOWER (what[wi]))
	{
	  --pi;
	  --wi;
	}

      /* It is a match only if the pattern was consumed entirely.  */
      if (pi == -1)
	return 1;
    }
  return 0;
}
|
|
|
|
|
|
|
|
/* Print error messages for host errors. */
|
|
|
|
char *
|
|
|
|
herrmsg (int error)
|
|
|
|
{
|
|
|
|
/* Can't use switch since some constants are equal (at least on my
|
|
|
|
system), and the compiler signals "duplicate case value". */
|
|
|
|
if (error == HOST_NOT_FOUND
|
|
|
|
|| error == NO_RECOVERY
|
|
|
|
|| error == NO_DATA
|
|
|
|
|| error == NO_ADDRESS
|
|
|
|
|| error == TRY_AGAIN)
|
|
|
|
return _("Host not found");
|
|
|
|
else
|
|
|
|
return _("Unknown error");
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
clean_hosts (void)
|
|
|
|
{
|
2000-11-19 15:50:10 -05:00
|
|
|
/* host_name_address_map and host_address_name_map share the
|
|
|
|
strings. Because of that, calling free_keys_and_values once
|
|
|
|
suffices for both. */
|
|
|
|
free_keys_and_values (host_name_address_map);
|
|
|
|
hash_table_destroy (host_name_address_map);
|
|
|
|
hash_table_destroy (host_address_name_map);
|
|
|
|
free_keys_and_values (host_slave_master_map);
|
|
|
|
hash_table_destroy (host_slave_master_map);
|
|
|
|
}
|
1999-12-02 02:42:23 -05:00
|
|
|
|
2000-11-19 15:50:10 -05:00
|
|
|
void
|
|
|
|
host_init (void)
|
|
|
|
{
|
|
|
|
host_name_address_map = make_string_hash_table (0);
|
|
|
|
host_address_name_map = make_string_hash_table (0);
|
|
|
|
host_slave_master_map = make_string_hash_table (0);
|
1999-12-02 02:42:23 -05:00
|
|
|
}
|