wget/src/host.c

566 lines
16 KiB
C

/* Dealing with host names.
Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
This file is part of Wget.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <assert.h>
#include <sys/types.h>
#ifdef WINDOWS
# include <winsock.h>
#else
# include <sys/socket.h>
# include <netinet/in.h>
# include <arpa/inet.h>
# include <netdb.h>
#endif /* WINDOWS */
#ifdef HAVE_SYS_UTSNAME_H
# include <sys/utsname.h>
#endif
#include <errno.h>
#include "wget.h"
#include "utils.h"
#include "host.h"
#include "url.h"
#ifndef errno
extern int errno;
#endif
/* Host list entry */
struct host
{
/* Host's symbolical name, as encountered at the time of first
inclusion, e.g. "fly.cc.fer.hr". */
char *hostname;
/* Host's "real" name, i.e. its IP address, written out in ASCII
form of N.N.N.N, e.g. "161.53.70.130". */
char *realname;
/* More than one HOSTNAME can correspond to the same REALNAME. For
our purposes, the canonical name of the host is its HOSTNAME when
it was first encountered. This entry is said to have QUALITY. */
int quality;
/* Next entry in the list. */
struct host *next;
};
static struct host *hlist;
static struct host *add_hlist PARAMS ((struct host *, const char *,
const char *, int));
/* The same as gethostbyname, but supports internet addresses of the
form `N.N.N.N'. */
struct hostent *
ngethostbyname (const char *name)
{
struct hostent *hp;
unsigned long addr;
addr = (unsigned long)inet_addr (name);
if ((int)addr != -1)
hp = gethostbyaddr ((char *)&addr, sizeof (addr), AF_INET);
else
hp = gethostbyname (name);
return hp;
}
/* Search for HOST in the linked list L, by hostname. Return the
entry, if found, or NULL. The search is case-insensitive. */
static struct host *
search_host (struct host *l, const char *host)
{
for (; l; l = l->next)
if (strcasecmp (l->hostname, host) == 0)
return l;
return NULL;
}
/* Like search_host, but searches by address. */
static struct host *
search_address (struct host *l, const char *address)
{
for (; l; l = l->next)
{
int cmp = strcmp (l->realname, address);
if (cmp == 0)
return l;
else if (cmp > 0)
return NULL;
}
return NULL;
}
/* Store the address of HOSTNAME, internet-style, to WHERE. First
check for it in the host list, and (if not found), use
ngethostbyname to get it.
Return 1 on successful finding of the hostname, 0 otherwise. */
int
store_hostaddress (unsigned char *where, const char *hostname)
{
struct host *t;
unsigned long addr;
struct hostent *hptr;
struct in_addr in;
char *inet_s;
/* If the address is of the form d.d.d.d, there will be no trouble
with it. */
addr = (unsigned long)inet_addr (hostname);
if ((int)addr == -1)
{
/* If it is not of that form, try to find it in the cache. */
t = search_host (hlist, hostname);
if (t)
addr = (unsigned long)inet_addr (t->realname);
}
/* If we have the numeric address, just store it. */
if ((int)addr != -1)
{
/* ADDR is in network byte order, meaning the code works on
little and big endian 32-bit architectures without change.
On big endian 64-bit architectures we need to be careful to
copy the correct four bytes. */
int offset = 0;
#ifdef WORDS_BIGENDIAN
offset = sizeof (unsigned long) - 4;
#endif
memcpy (where, (char *)&addr + offset, 4);
return 1;
}
/* Since all else has failed, let's try gethostbyname(). Note that
we use gethostbyname() rather than ngethostbyname(), because we
*know* the address is not numerical. */
hptr = gethostbyname (hostname);
if (!hptr)
return 0;
/* Copy the address of the host to socket description. */
memcpy (where, hptr->h_addr_list[0], hptr->h_length);
/* Now that we're here, we could as well cache the hostname for
future use, as in realhost(). First, we have to look for it by
address to know if it's already in the cache by another name. */
/* Originally, we copied to in.s_addr, but it appears to be missing
on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
t = search_address (hlist, inet_s);
if (t) /* Found in the list, as realname. */
{
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, hostname, inet_s, 0);
return 1;
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, hostname, inet_s, 1);
return 1;
}
/* Add a host to the host list. The list is sorted by addresses. For
equal addresses, the entries with quality should bubble towards the
beginning of the list. */
static struct host *
add_hlist (struct host *l, const char *nhost, const char *nreal, int quality)
{
struct host *t, *old, *beg;
/* The entry goes to the beginning of the list if the list is empty
or the order requires it. */
if (!l || (strcmp (nreal, l->realname) < 0))
{
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
t->next = l;
return t;
}
beg = l;
/* Second two one-before-the-last element. */
while (l->next)
{
int cmp;
old = l;
l = l->next;
cmp = strcmp (nreal, l->realname);
if (cmp >= 0)
continue;
/* If the next list element is greater than s, put s between the
current and the next list element. */
t = (struct host *)xmalloc (sizeof (struct host));
old->next = t;
t->next = l;
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
return beg;
}
t = (struct host *)xmalloc (sizeof (struct host));
t->hostname = xstrdup (nhost);
t->realname = xstrdup (nreal);
t->quality = quality;
/* Insert the new element after the last element. */
l->next = t;
t->next = NULL;
return beg;
}
/* Determine the "real" name of HOST, as perceived by Wget. If HOST
is referenced by more than one name, "real" name is considered to
be the first one encountered in the past.
If the host cannot be found in the list of already dealt-with
hosts, try with its INET address. If this fails too, add it to the
list. The routine does not call gethostbyname twice for the same
host if it can possibly avoid it. */
char *
realhost (const char *host)
{
struct host *l;
struct in_addr in;
struct hostent *hptr;
char *inet_s;
DEBUGP (("Checking for %s.\n", host));
/* Look for the host, looking by the host name. */
l = search_host (hlist, host);
if (l && l->quality) /* Found it with quality */
{
DEBUGP (("%s was already used, by that name.\n", host));
/* Here we return l->hostname, not host, because of the possible
case differences (e.g. jaGOR.srce.hr and jagor.srce.hr are
the same, but we want the one that was first. */
return xstrdup (l->hostname);
}
else if (!l) /* Not found, with or without quality */
{
/* The fact that gethostbyname will get called makes it
necessary to store it to the list, to ensure that
gethostbyname will not be called twice for the same string.
However, the quality argument must be set appropriately.
Note that add_hlist must be called *after* the realname
search, or the quality would be always set to 0 */
DEBUGP (("This is the first time I hear about host %s by that name.\n",
host));
hptr = ngethostbyname (host);
if (!hptr)
return xstrdup (host);
/* Originally, we copied to in.s_addr, but it appears to be
missing on some systems. */
memcpy (&in, *hptr->h_addr_list, sizeof (in));
STRDUP_ALLOCA (inet_s, inet_ntoa (in));
}
else /* Found, without quality */
{
/* This case happens when host is on the list,
but not as first entry (the one with quality).
Then we just get its INET address and pick
up the first entry with quality. */
DEBUGP (("We've dealt with host %s, but under the name %s.\n",
host, l->realname));
STRDUP_ALLOCA (inet_s, l->realname);
}
/* Now we certainly have the INET address. The following loop is
guaranteed to pick either an entry with quality (because it is
the first one), or none at all. */
l = search_address (hlist, inet_s);
if (l) /* Found in the list, as realname. */
{
/* Set the default, 0 quality. */
hlist = add_hlist (hlist, host, inet_s, 0);
return xstrdup (l->hostname);
}
/* Since this is really the first time this host is encountered,
set quality to 1. */
hlist = add_hlist (hlist, host, inet_s, 1);
return xstrdup (host);
}
/* Compare two hostnames (out of URL-s if the arguments are URL-s),
taking care of aliases. It uses realhost() to determine a unique
hostname for each of two hosts. If simple_check is non-zero, only
strcmp() is used for comparison. */
int
same_host (const char *u1, const char *u2)
{
const char *s;
char *p1, *p2;
char *real1, *real2;
/* Skip protocol, if present. */
u1 += skip_url (u1);
u2 += skip_url (u2);
u1 += skip_proto (u1);
u2 += skip_proto (u2);
/* Skip username ans password, if present. */
u1 += skip_uname (u1);
u2 += skip_uname (u2);
for (s = u1; *u1 && *u1 != '/' && *u1 != ':'; u1++);
p1 = strdupdelim (s, u1);
for (s = u2; *u2 && *u2 != '/' && *u2 != ':'; u2++);
p2 = strdupdelim (s, u2);
DEBUGP (("Comparing hosts %s and %s...\n", p1, p2));
if (strcasecmp (p1, p2) == 0)
{
free (p1);
free (p2);
DEBUGP (("They are quite alike.\n"));
return 1;
}
else if (opt.simple_check)
{
free (p1);
free (p2);
DEBUGP (("Since checking is simple, I'd say they are not the same.\n"));
return 0;
}
real1 = realhost (p1);
real2 = realhost (p2);
free (p1);
free (p2);
if (strcasecmp (real1, real2) == 0)
{
DEBUGP (("They are alike, after realhost()->%s.\n", real1));
free (real1);
free (real2);
return 1;
}
else
{
DEBUGP (("They are not the same (%s, %s).\n", real1, real2));
free (real1);
free (real2);
return 0;
}
}
/* Determine whether a URL is acceptable to be followed, according to
a list of domains to accept. */
int
accept_domain (struct urlinfo *u)
{
assert (u->host != NULL);
if (opt.domains)
{
if (!sufmatch ((const char **)opt.domains, u->host))
return 0;
}
if (opt.exclude_domains)
{
if (sufmatch ((const char **)opt.exclude_domains, u->host))
return 0;
}
return 1;
}
/* Check whether WHAT is matched in LIST, each element of LIST being a
pattern to match WHAT against, using backward matching (see
match_backwards() in utils.c).
If an element of LIST matched, 1 is returned, 0 otherwise. */
int
sufmatch (const char **list, const char *what)
{
int i, j, k, lw;
lw = strlen (what);
for (i = 0; list[i]; i++)
{
for (j = strlen (list[i]), k = lw; j >= 0 && k >= 0; j--, k--)
if (TOLOWER (list[i][j]) != TOLOWER (what[k]))
break;
/* The domain must be first to reach to beginning. */
if (j == -1)
return 1;
}
return 0;
}
/* Return email address of the form username@FQDN suitable for
anonymous FTP passwords. This process is error-prone, and the
escape hatch is the MY_HOST preprocessor constant, which can be
used to hard-code either your hostname or FQDN at compile-time.
If the FQDN cannot be determined, a warning is printed, and the
function returns a short `username@' form, accepted by most
anonymous servers.
If not even the username cannot be divined, it means things are
seriously fucked up, and Wget exits. */
char *
ftp_getaddress (void)
{
static char *address;
/* Do the drill only the first time, as it won't change. */
if (!address)
{
char userid[32]; /* 9 should be enough for Unix, but
I'd rather be on the safe side. */
char *host, *fqdn;
if (!pwd_cuserid (userid))
{
logprintf (LOG_ALWAYS, _("%s: Cannot determine user-id.\n"),
exec_name);
exit (1);
}
#ifdef MY_HOST
STRDUP_ALLOCA (host, MY_HOST);
#else /* not MY_HOST */
#ifdef HAVE_UNAME
{
struct utsname ubuf;
if (uname (&ubuf) < 0)
{
logprintf (LOG_ALWAYS, _("%s: Warning: uname failed: %s\n"),
exec_name, strerror (errno));
fqdn = "";
goto giveup;
}
STRDUP_ALLOCA (host, ubuf.nodename);
}
#else /* not HAVE_UNAME */
#ifdef HAVE_GETHOSTNAME
host = alloca (256);
if (gethostname (host, 256) < 0)
{
logprintf (LOG_ALWAYS, _("%s: Warning: gethostname failed\n"),
exec_name);
fqdn = "";
goto giveup;
}
#else /* not HAVE_GETHOSTNAME */
#error Cannot determine host name.
#endif /* not HAVE_GETHOSTNAME */
#endif /* not HAVE_UNAME */
#endif /* not MY_HOST */
/* If the address we got so far contains a period, don't bother
anymore. */
if (strchr (host, '.'))
fqdn = host;
else
{
/* #### I've seen the following scheme fail on at least one
system! Do we care? */
char *tmpstore;
/* According to Richard Stevens, the correct way to find the
FQDN is to (1) find the host name, (2) find its IP
address using gethostbyname(), and (3) get the FQDN using
gethostbyaddr(). So that's what we'll do. Step one has
been done above. */
/* (2) */
struct hostent *hp = gethostbyname (host);
if (!hp || !hp->h_addr_list)
{
logprintf (LOG_ALWAYS, _("\
%s: Warning: cannot determine local IP address.\n"),
exec_name);
fqdn = "";
goto giveup;
}
/* Copy the argument, so the call to gethostbyaddr doesn't
clobber it -- just in case. */
tmpstore = (char *)alloca (hp->h_length);
memcpy (tmpstore, *hp->h_addr_list, hp->h_length);
/* (3) */
hp = gethostbyaddr (tmpstore, hp->h_length, hp->h_addrtype);
if (!hp || !hp->h_name)
{
logprintf (LOG_ALWAYS, _("\
%s: Warning: cannot reverse-lookup local IP address.\n"),
exec_name);
fqdn = "";
goto giveup;
}
if (!strchr (hp->h_name, '.'))
{
#if 0
/* This gets ticked pretty often. Karl Berry reports
that there can be valid reasons for the local host
name not to be an FQDN, so I've decided to remove the
annoying warning. */
logprintf (LOG_ALWAYS, _("\
%s: Warning: reverse-lookup of local address did not yield FQDN!\n"),
exec_name);
#endif
fqdn = "";
goto giveup;
}
/* Once we're here, hp->h_name contains the correct FQDN. */
STRDUP_ALLOCA (fqdn, hp->h_name);
}
giveup:
address = (char *)xmalloc (strlen (userid) + 1 + strlen (fqdn) + 1);
sprintf (address, "%s@%s", userid, fqdn);
}
return address;
}
/* Print error messages for host errors. */
char *
herrmsg (int error)
{
/* Can't use switch since some constants are equal (at least on my
system), and the compiler signals "duplicate case value". */
if (error == HOST_NOT_FOUND
|| error == NO_RECOVERY
|| error == NO_DATA
|| error == NO_ADDRESS
|| error == TRY_AGAIN)
return _("Host not found");
else
return _("Unknown error");
}
/* Clean the host list. This is a separate function, so we needn't
export HLIST and its implementation. Ha! */
void
clean_hosts (void)
{
struct host *l = hlist;
while (l)
{
struct host *p = l->next;
free (l->hostname);
free (l->realname);
free (l);
l = p;
}
hlist = NULL;
}