wget/src/host.c

525 lines
15 KiB
C

/* Dealing with host names.
Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <assert.h>
#include <sys/types.h>
#ifdef WINDOWS
# include <winsock.h>
#else
# include <sys/socket.h>
# include <netinet/in.h>
# include <arpa/inet.h>
# include <netdb.h>
#endif /* WINDOWS */
#ifdef HAVE_SYS_UTSNAME_H
# include <sys/utsname.h>
#endif
#include <errno.h>
#include "wget.h"
#include "utils.h"
#include "host.h"
#include "url.h"
#include "hash.h"
#ifndef errno
extern int errno;
#endif
/* Mapping between all known hosts to their addresses (n.n.n.n).
   add_host_to_cache() stores the host-name keys lowercased.  The
   table is created by host_init() and released by clean_hosts().  */
struct hash_table *host_name_address_map;
/* Mapping between all known addresses (n.n.n.n) to their hosts.  This
   is the inverse of host_name_address_map.  These two tables share
   the strdup'ed strings, so the strings must be freed through only
   one of the two tables.  */
struct hash_table *host_address_name_map;
/* Mapping between auxiliary (slave) and master host names.  A host
   whose address is already cached under another name is recorded
   here by add_host_to_cache(), keyed by its lowercased name; keys and
   values are separately allocated copies.  */
struct hash_table *host_slave_master_map;
/* Return a freshly allocated copy of S (obtained from xstrdup()) in
   which every character has been passed through TOLOWER.  The caller
   owns the returned string.  */
static char *
xstrdup_lower (const char *s)
{
  char *lowered = xstrdup (s);
  size_t i;

  for (i = 0; lowered[i] != '\0'; i++)
    lowered[i] = TOLOWER (lowered[i]);

  return lowered;
}
/* The same as gethostbyname, but supports internet addresses of the
   form `N.N.N.N'.  On some systems gethostbyname() knows how to do
   this automatically.

   If NAME parses as a dotted-quad address, it is resolved with
   gethostbyaddr(); otherwise gethostbyname() is used.  Returns the
   resolver's `struct hostent' pointer, or NULL if the lookup failed.
   As with the underlying calls, the result lives in storage owned by
   the resolver library.  */
struct hostent *
ngethostbyname (const char *name)
{
  struct hostent *hp;
  unsigned long addr;

  addr = (unsigned long)inet_addr (name);
  if ((int)addr != -1)
    {
      /* An AF_INET address is exactly four octets.  ADDR is an
         unsigned long, which may be wider than that, so hand
         gethostbyaddr() only the four bytes that hold the address
         instead of sizeof (addr) bytes.  On big endian machines with
         a long wider than 32 bits those are the trailing bytes.  */
      int offset;
#ifdef WORDS_BIGENDIAN
      offset = sizeof (unsigned long) - 4;
#else
      offset = 0;
#endif
      hp = gethostbyaddr ((char *)&addr + offset, 4, AF_INET);
    }
  else
    hp = gethostbyname (name);
  return hp;
}
/* Record in the cache that host name HOST has the textual address
   ADDR_TEXT.

   If ADDR_TEXT has not been seen before, the (HOST, ADDR_TEXT) pair
   is added to both host_name_address_map and host_address_name_map;
   the two tables share one lowercased copy of HOST and one copy of
   ADDR_TEXT.  (It is the caller's responsibility to make sure that
   HOST is not already in host_name_address_map.)

   If ADDR_TEXT is already known under another host name, HOST is
   instead added to host_slave_master_map, pointing at that name.  */
static void
add_host_to_cache (const char *host, const char *addr_text)
{
  char *known_name = hash_table_get (host_address_name_map, addr_text);
  char *name_key;
  char *addr_key;

  if (known_name != NULL)
    {
      /* The address is already cached under another name, so HOST
         becomes a slave of that name.  */
      DEBUGP (("Mapping %s to %s in host_slave_master_map.\n",
               host, known_name));
      hash_table_put (host_slave_master_map,
                      xstrdup_lower (host),
                      xstrdup_lower (known_name));
      return;
    }

  /* First encounter with this address: enter it in both directions,
     sharing the two freshly allocated strings between the tables.  */
  name_key = xstrdup_lower (host);
  addr_key = xstrdup (addr_text);
  DEBUGP (("Caching %s <-> %s\n", name_key, addr_key));
  hash_table_put (host_name_address_map, name_key, addr_key);
  hash_table_put (host_address_name_map, addr_key, name_key);
}
/* Copy the four octets of the IPv4 address held in ADDR (as returned
   by inet_addr(), i.e. in network byte order) to WHERE.

   ADDR is defined to be in network byte order, meaning the code
   works on little and big endian 32-bit architectures without
   change.  On big endian 64-bit architectures we need to be careful
   to copy the correct four bytes.  */
static void
store_ipv4_octets (unsigned char *where, unsigned long addr)
{
  int offset;
#ifdef WORDS_BIGENDIAN
  offset = sizeof (unsigned long) - 4;
#else
  offset = 0;
#endif
  memcpy (where, (char *)&addr + offset, 4);
}

/* Store the address of HOSTNAME, internet-style (four octets in
   network order), to WHERE, which must have room for four bytes.
   First try to get the address from the cache; if it is not
   available, call the DNS functions and update the cache.

   The cache tables are keyed by lowercased host names (see
   add_host_to_cache), so the cache is consulted with a lowercased
   copy of HOSTNAME.

   Return 1 on successful finding of the hostname, 0 otherwise.  */
int
store_hostaddress (unsigned char *where, const char *hostname)
{
  unsigned long addr;
  char *lookup_name;
  char *addr_text;
  struct hostent *hptr;
  struct in_addr in;

  /* If the address is of the form d.d.d.d, there will be no trouble
     with it: just store it.  */
  addr = (unsigned long)inet_addr (hostname);
  if ((int)addr != -1)
    {
      store_ipv4_octets (where, addr);
      return 1;
    }

  /* By now we know that the address is not of the form d.d.d.d.  Try
     to find it in our cache of host addresses.  */
  lookup_name = xstrdup_lower (hostname);
  addr_text = hash_table_get (host_name_address_map, lookup_name);
  if (addr_text)
    {
      DEBUGP (("Found %s in host_name_address_map: %s\n",
               hostname, addr_text));
    }
  else
    {
      /* Maybe this host is known to us under another name.  If so,
         we'll find it in host_slave_master_map, and use the master
         name to find its address in host_name_address_map.  */
      char *canonical_name = hash_table_get (host_slave_master_map,
                                             lookup_name);
      if (canonical_name)
        {
          addr_text = hash_table_get (host_name_address_map, canonical_name);
          assert (addr_text != NULL);
          DEBUGP (("Found %s as slave of %s -> %s\n",
                   hostname, canonical_name, addr_text));
        }
    }
  /* ADDR_TEXT points into the cache, not into LOOKUP_NAME.  */
  xfree (lookup_name);

  if (addr_text)
    {
      addr = (unsigned long)inet_addr (addr_text);
      store_ipv4_octets (where, addr);
      return 1;
    }

  /* Since all else has failed, let's try gethostbyname().  Note that
     we use gethostbyname() rather than ngethostbyname(), because we
     already know that the address is not numerical.  */
  hptr = gethostbyname (hostname);
  if (!hptr)
    return 0;

  /* Verify the address size before copying, and never write more
     than the four bytes WHERE is documented to hold.  */
  assert (hptr->h_length == 4);
  memcpy (where, hptr->h_addr_list[0], 4);

  /* Now that we've gone through the trouble of calling
     gethostbyname(), we can store this valuable information to the
     cache.  add_host_to_cache() looks it up by address to know if
     it's already in the cache by another name.  */
  /* Originally, we copied to in.s_addr, but it appears to be missing
     on some systems.  */
  memcpy (&in, *hptr->h_addr_list, sizeof (in));
  add_host_to_cache (hostname, inet_ntoa (in));
  return 1;
}
/* Determine the "real" name of HOST, as perceived by Wget.  If HOST
   is referenced by more than one name, "real" name is considered to
   be the first one encountered in the past.

   The cache tables are keyed by lowercased host names (see
   add_host_to_cache), so every lookup here uses a lowercased copy of
   HOST.  Looking up the raw spelling would miss entries for
   mixed-case names, including the slave mapping this function has
   just caused to be inserted.

   Returns a newly allocated string that the caller must free: either
   the master name recorded for HOST, or the lowercased HOST itself
   when no other name is known (or when the lookup failed).  */
char *
realhost (const char *host)
{
  struct in_addr in;
  struct hostent *hptr;
  char *master_name;
  char *lookup_name = xstrdup_lower (host);

  DEBUGP (("Checking for %s in host_name_address_map.\n", host));
  if (hash_table_exists (host_name_address_map, lookup_name))
    {
      DEBUGP (("Found; %s was already used, by that name.\n", host));
      return lookup_name;
    }

  DEBUGP (("Checking for %s in host_slave_master_map.\n", host));
  master_name = hash_table_get (host_slave_master_map, lookup_name);
  if (!master_name)
    {
      DEBUGP (("First time I hear about %s by that name; looking it up.\n",
               host));
      hptr = ngethostbyname (host);
      if (hptr)
        {
          /* Originally, we copied to in.s_addr, but it appears to be
             missing on some systems.  */
          memcpy (&in, *hptr->h_addr_list, sizeof (in));
          add_host_to_cache (host, inet_ntoa (in));

          /* add_host_to_cache() can establish a slave-master mapping.  */
          DEBUGP (("Checking again for %s in host_slave_master_map.\n",
                   host));
          master_name = hash_table_get (host_slave_master_map, lookup_name);
        }
    }

  if (master_name)
    {
      DEBUGP (("Found; %s was already used, by the name %s.\n",
               host, master_name));
      xfree (lookup_name);
      return xstrdup (master_name);
    }

  return lookup_name;
}
/* Decide whether U1 and U2 name the same host.  Each argument may be
   a bare host name or a URL; anything skipped by skip_url(),
   skip_proto() and skip_uname() is ignored, and the host is taken to
   end at the first '/' or ':' (or at the end of the string).

   The two host strings are first compared with strcasecmp().  If
   they differ and opt.simple_check is non-zero, the hosts are
   declared different right away.  Otherwise each host is passed
   through realhost() to take care of aliases, and the results are
   compared with strcasecmp().

   Returns 1 if the hosts are considered the same, 0 otherwise.  */
int
same_host (const char *u1, const char *u2)
{
  const char *begin1, *begin2;
  char *host1, *host2;
  int alike;

  /* Skip protocol, if present.  */
  u1 += skip_url (u1);
  u2 += skip_url (u2);
  u1 += skip_proto (u1);
  u2 += skip_proto (u2);

  /* Skip username and password, if present.  */
  u1 += skip_uname (u1);
  u2 += skip_uname (u2);

  /* Isolate the host part of the first argument.  */
  begin1 = u1;
  while (*u1 && *u1 != '/' && *u1 != ':')
    ++u1;
  host1 = strdupdelim (begin1, u1);

  /* Likewise for the second argument.  */
  begin2 = u2;
  while (*u2 && *u2 != '/' && *u2 != ':')
    ++u2;
  host2 = strdupdelim (begin2, u2);

  DEBUGP (("Comparing hosts %s and %s...\n", host1, host2));
  if (strcasecmp (host1, host2) == 0)
    {
      DEBUGP (("They are quite alike.\n"));
      alike = 1;
    }
  else if (opt.simple_check)
    {
      DEBUGP (("Since checking is simple, I'd say they are not the same.\n"));
      alike = 0;
    }
  else
    {
      /* The spellings differ; compare the names Wget considers
         canonical for each host.  */
      char *real1 = realhost (host1);
      char *real2 = realhost (host2);

      alike = (strcasecmp (real1, real2) == 0);
      if (alike)
        {
          DEBUGP (("They are alike, after realhost()->%s.\n", real1));
        }
      else
        {
          DEBUGP (("They are not the same (%s, %s).\n", real1, real2));
        }
      xfree (real1);
      xfree (real2);
    }

  xfree (host1);
  xfree (host2);
  return alike;
}
/* Decide whether the URL described by U may be followed, judging by
   its host (U->host, which must not be NULL).

   When opt.domains is set, the host must match one of its entries;
   when opt.exclude_domains is set, the host must match none of its
   entries.  Matching is done by sufmatch().

   Returns 1 if the URL is acceptable, 0 otherwise.  */
int
accept_domain (struct urlinfo *u)
{
  assert (u->host != NULL);

  /* Reject hosts outside the list of accepted domains.  */
  if (opt.domains && !sufmatch ((const char **)opt.domains, u->host))
    return 0;

  /* Reject hosts inside the list of excluded domains.  */
  if (opt.exclude_domains
      && sufmatch ((const char **)opt.exclude_domains, u->host))
    return 0;

  return 1;
}
/* Check whether WHAT is matched in LIST, a NULL-terminated array of
   domain patterns.  Each pattern is compared, ignoring case, against
   the tail of WHAT, walking both strings backwards from their
   terminating NUL.

   A pattern matches only on a domain-label boundary, i.e. when

     - it equals WHAT entirely ("gnu.org" vs "gnu.org"), or
     - the character of WHAT just before the matched tail is a dot
       ("gnu.org" vs "www.gnu.org"), or
     - the pattern itself begins with a dot (".gnu.org" vs
       "www.gnu.org").

   Hence "bar.com" does not match the unrelated host "foobar.com".
   Empty patterns are ignored rather than matching everything.

   If an element of LIST matched, 1 is returned, 0 otherwise.  */
int
sufmatch (const char **list, const char *what)
{
  int i, j, k, lw;

  lw = strlen (what);
  for (i = 0; list[i]; i++)
    {
      /* An empty pattern is trivially a suffix of every host; treat
         it as no pattern at all.  */
      if (list[i][0] == '\0')
        continue;

      for (j = strlen (list[i]), k = lw; j >= 0 && k >= 0; j--, k--)
        if (TOLOWER (list[i][j]) != TOLOWER (what[k]))
          break;

      /* J == -1 means the whole pattern was consumed, so it is a
         suffix of WHAT.  K is then the index of the character of WHAT
         preceding the matched tail, or -1 if nothing precedes it.  */
      if (j == -1 && (k == -1 || what[k] == '.' || list[i][0] == '.'))
        return 1;
    }
  return 0;
}
/* Return email address of the form username@FQDN suitable for
   anonymous FTP passwords.  This process is error-prone, and the
   escape hatch is the MY_HOST preprocessor constant, which can be
   used to hard-code either your hostname or FQDN at compile-time.

   If the FQDN cannot be determined, the function returns a short
   `username@' form, accepted by most anonymous servers; in most of
   those cases a warning is logged first.

   The address is computed on the first call only and cached in a
   static pointer; later calls return that same pointer.  Because the
   pointer is handed out again, freeing it would leave the cache
   dangling, so callers should treat the string as owned by this
   function and copy it if they need a string of their own.
   NOTE(review): confirm that no existing caller frees the result.

   If not even the username can be determined, an error is logged and
   Wget exits with status 1.  */
char *
ftp_getaddress (void)
{
  static char *address;

  /* Do the drill only the first time, as it won't change.  */
  if (!address)
    {
      char userid[32];          /* 9 should be enough for Unix, but
                                   I'd rather be on the safe side.  */
      char *host, *fqdn;

      if (!pwd_cuserid (userid))
        {
          logprintf (LOG_ALWAYS, _("%s: Cannot determine user-id.\n"),
                     exec_name);
          exit (1);
        }
#ifdef MY_HOST
      STRDUP_ALLOCA (host, MY_HOST);
#else /* not MY_HOST */
#ifdef HAVE_UNAME
      {
        struct utsname ubuf;
        if (uname (&ubuf) < 0)
          {
            logprintf (LOG_ALWAYS, _("%s: Warning: uname failed: %s\n"),
                       exec_name, strerror (errno));
            fqdn = "";
            goto giveup;
          }
        STRDUP_ALLOCA (host, ubuf.nodename);
      }
#else /* not HAVE_UNAME */
#ifdef HAVE_GETHOSTNAME
      host = alloca (256);
      if (gethostname (host, 256) < 0)
        {
          logprintf (LOG_ALWAYS, _("%s: Warning: gethostname failed\n"),
                     exec_name);
          fqdn = "";
          goto giveup;
        }
      /* gethostname() is not required to NUL-terminate the name when
         it had to be truncated to fit the buffer.  */
      host[255] = '\0';
#else /* not HAVE_GETHOSTNAME */
#error Cannot determine host name.
#endif /* not HAVE_GETHOSTNAME */
#endif /* not HAVE_UNAME */
#endif /* not MY_HOST */
      /* If the address we got so far contains a period, don't bother
         anymore.  */
      if (strchr (host, '.'))
        fqdn = host;
      else
        {
          /* #### I've seen the following scheme fail on at least one
             system!  Do we care?  */
          char *tmpstore;
          /* According to Richard Stevens, the correct way to find the
             FQDN is to (1) find the host name, (2) find its IP
             address using gethostbyname(), and (3) get the FQDN using
             gethostbyaddr().  So that's what we'll do.  Step one has
             been done above.  */
          /* (2) */
          struct hostent *hp = gethostbyname (host);
          /* The first address is dereferenced below, so make sure the
             list is not only present but also non-empty.  */
          if (!hp || !hp->h_addr_list || !*hp->h_addr_list)
            {
              logprintf (LOG_ALWAYS, _("\
%s: Warning: cannot determine local IP address.\n"),
                         exec_name);
              fqdn = "";
              goto giveup;
            }
          /* Copy the argument, so the call to gethostbyaddr doesn't
             clobber it -- just in case.  */
          tmpstore = (char *)alloca (hp->h_length);
          memcpy (tmpstore, *hp->h_addr_list, hp->h_length);
          /* (3) */
          hp = gethostbyaddr (tmpstore, hp->h_length, hp->h_addrtype);
          if (!hp || !hp->h_name)
            {
              logprintf (LOG_ALWAYS, _("\
%s: Warning: cannot reverse-lookup local IP address.\n"),
                         exec_name);
              fqdn = "";
              goto giveup;
            }
          if (!strchr (hp->h_name, '.'))
            {
#if 0
              /* This gets ticked pretty often.  Karl Berry reports
                 that there can be valid reasons for the local host
                 name not to be an FQDN, so I've decided to remove the
                 annoying warning.  */
              logprintf (LOG_ALWAYS, _("\
%s: Warning: reverse-lookup of local address did not yield FQDN!\n"),
                       exec_name);
#endif
              fqdn = "";
              goto giveup;
            }
          /* Once we're here, hp->h_name contains the correct FQDN.  */
          STRDUP_ALLOCA (fqdn, hp->h_name);
        }
    giveup:
      /* Room for USERID, the '@', FQDN and the terminating NUL.  */
      address = (char *)xmalloc (strlen (userid) + 1 + strlen (fqdn) + 1);
      sprintf (address, "%s@%s", userid, fqdn);
    }
  return address;
}
/* Print error messages for host errors. */
char *
herrmsg (int error)
{
/* Can't use switch since some constants are equal (at least on my
system), and the compiler signals "duplicate case value". */
if (error == HOST_NOT_FOUND
|| error == NO_RECOVERY
|| error == NO_DATA
|| error == NO_ADDRESS
|| error == TRY_AGAIN)
return _("Host not found");
else
return _("Unknown error");
}
/* Release the three host cache tables together with the strings
   stored in them.  */
void
clean_hosts (void)
{
  /* The slave/master table holds separately allocated copies of its
     keys and values.  */
  free_keys_and_values (host_slave_master_map);
  hash_table_destroy (host_slave_master_map);

  /* host_name_address_map and host_address_name_map share the
     strings, so the strings are freed through one table only; after
     that both tables themselves can go.  */
  free_keys_and_values (host_name_address_map);
  hash_table_destroy (host_address_name_map);
  hash_table_destroy (host_name_address_map);
}
/* Create the three (initially empty) string-keyed hash tables used
   as the host cache.  Each is created with make_string_hash_table (0);
   presumably 0 requests a default initial size -- confirm against
   hash.c.  */
void
host_init (void)
{
  /* Alias -> canonical host name.  */
  host_slave_master_map = make_string_hash_table (0);

  /* Address -> host name, and its inverse, host name -> address.  */
  host_address_name_map = make_string_hash_table (0);
  host_name_address_map = make_string_hash_table (0);
}