From 6e598c81e36a23ae6c0651fab303a129de68522c Mon Sep 17 00:00:00 2001 From: hniksic Date: Mon, 20 Nov 2000 18:06:36 -0800 Subject: [PATCH] [svn] Committed a bunch of different tweaks of mine. Published in . --- src/ChangeLog | 37 +++++ src/hash.c | 373 +++++++++++++++++++++++++++++++++++-------------- src/html-url.c | 9 +- src/http.c | 19 +-- src/recur.c | 71 +++++----- src/url.c | 22 ++- src/url.h | 34 +++-- src/utils.c | 57 +++++++- src/utils.h | 2 + 9 files changed, 439 insertions(+), 185 deletions(-) diff --git a/src/ChangeLog b/src/ChangeLog index ed96827b..642f94a8 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,40 @@ +2000-11-21 Hrvoje Niksic + + * hash.c (find_mapping): New function. + (hash_table_get): Use it. + (hash_table_get_pair): Ditto. + (hash_table_exists): Ditto. + (hash_table_remove): Ditto. + (hash_table_remove): Really delete the entry if the mapping + following LOCATION is empty. + + * utils.c (string_set_add): Check whether the element has existed + before. + + * hash.c (hash_table_get_pair): New function. + +2000-11-20 Hrvoje Niksic + + * http.c (http_process_type): Ignore trailing whitespace; use + strdupdelim(). + + * recur.c (recursive_retrieve): Use the new `convert' field. + (convert_all_links): Ditto. + (convert_all_links): Don't respect meta_disallow_follow. + + * html-url.c (handle_link): Fill out link_relative_p and + link_complete_p. + + * url.h (struct _urlpos): Make elements more readable. + + * recur.c (recursive_retrieve): Call slist_prepend instead of + slist_append. + (convert_all_links): Call slist_nreverse before iterating through + urls_html. + + * utils.c (slist_prepend): New function. + (slist_nreverse): Ditto. + 2000-11-20 Hrvoje Niksic * http.c (check_end): Constify. diff --git a/src/hash.c b/src/hash.c index e54fb33a..2bbb0a12 100644 --- a/src/hash.c +++ b/src/hash.c @@ -5,8 +5,8 @@ This file is part of Wget. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -34,19 +34,91 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ # define xrealloc realloc #endif -/* This file implements simple hash tables based on linear probing. - The hash table stores key-value pairs in a contiguous array. Both - key and value are void pointers that the hash and test functions - know how to handle. +/* INTERFACE: - Although Knuth & co. recommend double hashing over linear probing, - we use the latter because it accesses array elements sequentially - in case of a collision, yielding in better cache behaviour and - ultimately in better speed. To avoid collision problems with - linear probing, we make sure that the table grows as soon as the - fullness/size ratio exceeds 75%. */ + Hash tables are an implementation technique used to implement + mapping between objects. Provided a good hashing function is used, + they guarantee constant-time access and storing of information. + Duplicate keys are not allowed. -struct ht_pair { + The basics are all covered. hash_table_new creates a hash table, + and hash_table_destroy deletes it. hash_table_put establishes a + mapping between a key and a value. hash_table_get retrieves the + value that corresponds to a key. hash_table_exists queries whether + a key is stored in a table at all. hash_table_remove removes a + mapping that corresponds to a key. hash_table_map allows you to + map through all the entries in a hash table. hash_table_clear + clears all the entries from the hash table. + + The number of mappings in a table is not limited, except by the + amount of memory. As you add new elements to a table, it regrows + as necessary. If you have an idea about how many elements you will + store, you can provide a hint to hash_table_new(). + + The hashing and equality functions are normally provided by the + user. For the special (and frequent) case of hashing strings, you + can use the pre-canned make_string_hash_table(), which provides the + string hashing function from the Dragon Book, and a string equality + wrapper around strcmp(). + + When specifying your own hash and test functions, make sure the + following holds true: + + - The test function returns non-zero for keys that are considered + "equal", zero otherwise. + + - The hash function returns a number that represents the + "distinctness" of the object. In more precise terms, it means + that for any two objects that test "equal" under the test + function, the hash function MUST produce the same result. + + This does not mean that each distinct object must produce a + distinct value, only that non-distinct objects must produce the + same values! For instance, a hash function that returns 0 for + any given object is a perfectly valid (albeit extremely bad) hash + function. + + The above stated rule is quite easy to enforce. For example, if + your testing function compares strings case-insensitively, all + your function needs to do is lower-case the string characters + before calculating a hash. That way you have easily guaranteed + that changes in case will not result in a different hash. + + - (optional) Choose the hash function to get as good "spreading" as + possible. A good hash function will react to even a small change + in its input with a completely different resulting hash. + Finally, don't make your hash function extremely slow, because + you're then defeating the purpose of hashing. + + Note that neither keys nor values are copied when inserted into the + hash table, so they must exist for the lifetime of the table. This + means that e.g. the use of static strings is OK, but objects with a + shorter life-time need to be copied (with strdup() or the like in + the case of strings) before being inserted. */ + +/* IMPLEMENTATION: + + All the hash mappings (key-value pairs of pointers) are stored in a + contiguous array. The position of each mapping is determined by + applying the hash function to the key: location = hash(key) % size. + If two different keys end up on the same position, the collision is + resolved by placing the second mapping at the next empty place in + the array following the occupied place. This method of collision + resolution is called "linear probing". + + There are more advanced collision resolution mechanisms (quadratic + probing, double hashing), but we don't use them because they + involve more non-sequential access to the array, and therefore + worse cache behavior. Linear probing works well as long as the + fullness/size ratio is kept below 75%. We make sure to regrow or + rehash the hash table whenever this threshold is exceeded. + + Collisions make deletion tricky because finding collisions again + relies on new empty spots not being created. That's why + hash_table_remove only marks the spot as deleted rather than really + making it empty. */ + +struct mapping { void *key; void *value; }; @@ -60,13 +132,14 @@ struct hash_table { int count; /* number of non-empty, non-deleted fields. */ - struct ht_pair *pairs; + struct mapping *mappings; }; #define ENTRY_DELETED ((void *)0xdeadbeef) +#define ENTRY_EMPTY NULL #define DELETED_ENTRY_P(ptr) ((ptr) == ENTRY_DELETED) -#define EMPTY_ENTRY_P(ptr) ((ptr) == NULL) +#define EMPTY_ENTRY_P(ptr) ((ptr) == ENTRY_EMPTY) /* Find a prime near, but greather than or equal to SIZE. */ @@ -109,8 +182,8 @@ hash_table_new (int initial_size, ht->size = prime_size (initial_size); ht->fullness = 0; ht->count = 0; - ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); - memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + ht->mappings = xmalloc (ht->size * sizeof (struct mapping)); + memset (ht->mappings, '\0', ht->size * sizeof (struct mapping)); return ht; } @@ -119,34 +192,74 @@ hash_table_new (int initial_size, void hash_table_destroy (struct hash_table *ht) { - free (ht->pairs); + free (ht->mappings); free (ht); } +/* The heart of almost all functions in this file -- find the mapping + whose KEY is equal to key, using a linear probing loop. Returns + the offset of the mapping in ht->mappings. This should probably be + declared inline. */ + +static int +find_mapping (struct hash_table *ht, const void *key) +{ + struct mapping *mappings = ht->mappings; + int size = ht->size; + int location = ht->hash_function (key) % size; + while (1) + { + struct mapping *mp = mappings + location; + void *mp_key = mp->key; + + if (EMPTY_ENTRY_P (mp_key)) + return -1; + else if (DELETED_ENTRY_P (mp_key) + || !ht->test_function (key, mp_key)) + { + if (++location == size) + location = 0; + } + else + return location; + } +} + /* Get the value that corresponds to the key KEY in the hash table HT. If no value is found, return NULL. Note that NULL is a legal value for value; if you are storing NULLs in your hash table, you can use hash_table_exists to be sure that a (possibly NULL) value exists in - the table. */ + the table. Or, you can use hash_table_get_pair instead of this + function. */ void * hash_table_get (struct hash_table *ht, const void *key) { - int location = ht->hash_function (key) % ht->size; - while (1) + int location = find_mapping (ht, key); + if (location < 0) + return NULL; + else + return ht->mappings[location].value; +} + +/* Like hash_table_get, but writes out the pointers to both key and + value. Returns non-zero on success. */ + +int +hash_table_get_pair (struct hash_table *ht, const void *lookup_key, + void *orig_key, void *value) +{ + int location = find_mapping (ht, lookup_key); + if (location < 0) + return 0; + else { - struct ht_pair *the_pair = ht->pairs + location; - if (EMPTY_ENTRY_P (the_pair->key)) - return NULL; - else if (DELETED_ENTRY_P (the_pair->key) - || !ht->test_function (key, the_pair->key)) - { - ++location; - if (location == ht->size) - location = 0; - } - else - return the_pair->value; + struct mapping *mp = ht->mappings + location; + if (orig_key) + *(void **)orig_key = mp->key; + if (value) + *(void **)value = mp->value; + return 1; } } @@ -155,39 +268,25 @@ hash_table_get (struct hash_table *ht, const void *key) int hash_table_exists (struct hash_table *ht, const void *key) { - int location = ht->hash_function (key) % ht->size; - while (1) - { - struct ht_pair *the_pair = ht->pairs + location; - if (EMPTY_ENTRY_P (the_pair->key)) - return 0; - else if (DELETED_ENTRY_P (the_pair->key) - || !ht->test_function (key, the_pair->key)) - { - ++location; - if (location == ht->size) - location = 0; - } - else - return 1; - } + return find_mapping (ht, key) >= 0; } #define MAX(i, j) (((i) >= (j)) ? (i) : (j)) /* Grow hash table HT as necessary, and rehash all the key-value - pairs. */ + mappings. */ static void grow_hash_table (struct hash_table *ht) { int i; - struct ht_pair *old_pairs = ht->pairs; + struct mapping *old_mappings = ht->mappings; int old_count = ht->count; /* for assert() below */ int old_size = ht->size; - /* Normally, the idea is to double ht->size (and round it to next - prime) on each regrow: + /* To minimize the number of regrowth, we'd like to resize the hash + table exponentially. Normally, this would be done by doubling + ht->size (and round it to next prime) on each regrow: ht->size = prime_size (ht->size * 2); @@ -198,24 +297,28 @@ grow_hash_table (struct hash_table *ht) only because we don't want to actually shrink the table. (But maybe that's wrong.) */ - int needed_size = prime_size (ht->count * 2); + int needed_size = prime_size (ht->count * 3); ht->size = MAX (old_size, needed_size); - ht->pairs = xmalloc (ht->size * sizeof (struct ht_pair)); - memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + printf ("growing from %d to %d\n", old_size, ht->size); + + ht->mappings = xmalloc (ht->size * sizeof (struct mapping)); + memset (ht->mappings, '\0', ht->size * sizeof (struct mapping)); /* Need to reset these two; hash_table_put will reinitialize them. */ ht->fullness = 0; ht->count = 0; for (i = 0; i < old_size; i++) { - struct ht_pair *the_pair = old_pairs + i; - if (!EMPTY_ENTRY_P (the_pair->key) - && !DELETED_ENTRY_P (the_pair->key)) - hash_table_put (ht, the_pair->key, the_pair->value); + struct mapping *mp = old_mappings + i; + void *mp_key = mp->key; + + if (!EMPTY_ENTRY_P (mp_key) + && !DELETED_ENTRY_P (mp_key)) + hash_table_put (ht, mp_key, mp->value); } assert (ht->count == old_count); - free (old_pairs); + free (old_mappings); } /* Put VALUE in the hash table HT under the key KEY. This regrows the @@ -224,27 +327,34 @@ grow_hash_table (struct hash_table *ht) void hash_table_put (struct hash_table *ht, const void *key, void *value) { - int location = ht->hash_function (key) % ht->size; + /* Cannot use find_mapping here because we treat deleted entries + specially. */ + + struct mapping *mappings = ht->mappings; + int size = ht->size; + int location = ht->hash_function (key) % size; while (1) { - struct ht_pair *the_pair = ht->pairs + location; - if (EMPTY_ENTRY_P (the_pair->key)) + struct mapping *mp = mappings + location; + void *mp_key = mp->key; + + if (EMPTY_ENTRY_P (mp_key)) { ++ht->fullness; ++ht->count; just_insert: - the_pair->key = (void *)key; /* const? */ - the_pair->value = value; + mp->key = (void *)key; /* const? */ + mp->value = value; break; } - else if (DELETED_ENTRY_P (the_pair->key)) + else if (DELETED_ENTRY_P (mp_key)) { /* We're replacing a deleteed entry, so ht->count gets increased, but ht->fullness remains unchanged. */ ++ht->count; goto just_insert; } - else if (ht->test_function (key, the_pair->key)) + else if (ht->test_function (key, mp_key)) { /* We're replacing an existing entry, so ht->count and ht->fullness remain unchanged. */ @@ -252,8 +362,7 @@ hash_table_put (struct hash_table *ht, const void *key, void *value) } else { - ++location; - if (location == ht->size) + if (++location == size) location = 0; } } @@ -267,60 +376,79 @@ hash_table_put (struct hash_table *ht, const void *key, void *value) int hash_table_remove (struct hash_table *ht, const void *key) { - int location = ht->hash_function (key) % ht->size; - while (1) + int location = find_mapping (ht, key); + if (location < 0) + return 0; + else { - struct ht_pair *the_pair = ht->pairs + location; - if (EMPTY_ENTRY_P (the_pair->key)) - return 0; - else if (DELETED_ENTRY_P (the_pair->key) - || !ht->test_function (key, the_pair->key)) + struct mapping *mappings = ht->mappings; + struct mapping *mp = mappings + location; + /* We don't really remove an entry from the hash table: we just + mark it as deleted. This is because there may be other + entries located after this entry whose hash points to a + location before this entry. (Example: keys A, B and C have + the same hash. If you were to really *delete* B from the + table, C could no longer be found.) */ + + /* Optimization addendum: if the mapping that follows LOCATION + is already empty, that is a sure sign that nobody depends on + LOCATION being non-empty. (This is because we're using + linear probing. This would not be the case with double + hashing.) In that case, we may safely delete the mapping. */ + + /* This could be generalized so that the all the non-empty + locations following LOCATION are simply shifted leftward. It + would make deletion a bit slower, but it would remove the + ugly DELETED_ENTRY_P checks from all the rest of the code, + making the whole thing faster. */ + int location_after = (location + 1) == ht->size ? 0 : location + 1; + struct mapping *mp_after = mappings + location_after; + + if (EMPTY_ENTRY_P (mp_after->key)) { - ++location; - if (location == ht->size) - location = 0; + mp->key = ENTRY_EMPTY; + --ht->fullness; } else - { - /* We don't really remove an entry from the hash table: we - just mark it as deleted. This is because there may be - other entries located after this entry whose hash number - points to a location before this entry. (Example: keys - A, B and C have the same hash. If you were to really - *delete* B from the table, C could no longer be found.) + mp->key = ENTRY_DELETED; - As an optimization, it might be worthwhile to check - whether the immediately preceding entry is empty and, if - so, really delete the pair (set it to empty and decrease - the fullness along with the count). I *think* it should - be safe. */ - the_pair->key = ENTRY_DELETED; - --ht->count; - return 1; - } + --ht->count; + return 1; } } +/* Clear HT of all entries. After calling this function, the count + and the fullness of the hash table will be zero. The size will + remain unchanged. */ + void hash_table_clear (struct hash_table *ht) { - memset (ht->pairs, '\0', ht->size * sizeof (struct ht_pair)); + memset (ht->mappings, '\0', ht->size * sizeof (struct mapping)); ht->fullness = 0; ht->count = 0; } +/* Map MAPFUN over all the mappings in hash table HT. MAPFUN is + called with three arguments: the key, the value, and the CLOSURE. + Don't add or remove entries from HT while hash_table_map is being + called, or strange things may happen. */ + void hash_table_map (struct hash_table *ht, int (*mapfun) (void *, void *, void *), void *closure) { + struct mapping *mappings = ht->mappings; int i; for (i = 0; i < ht->size; i++) { - struct ht_pair *the_pair = ht->pairs + i; - if (!EMPTY_ENTRY_P (the_pair->key) - && !DELETED_ENTRY_P (the_pair->key)) - if (mapfun (the_pair->key, the_pair->value, closure)) + struct mapping *mp = mappings + i; + void *mp_key = mp->key; + + if (!EMPTY_ENTRY_P (mp_key) + && !DELETED_ENTRY_P (mp_key)) + if (mapfun (mp_key, mp->value, closure)) return; } } @@ -345,12 +473,33 @@ string_hash (const void *sv) return h; } +#if 0 +/* If I ever need it: hashing of integers. */ + +unsigned int +inthash (unsigned int key) +{ + key += (key << 12); + key ^= (key >> 22); + key += (key << 4); + key ^= (key >> 9); + key += (key << 10); + key ^= (key >> 2); + key += (key << 7); + key ^= (key >> 12); + return key; +} +#endif + int string_cmp (const void *s1, const void *s2) { return !strcmp ((const char *)s1, (const char *)s2); } +/* Return a hash table of initial size INITIAL_SIZE suitable to use + strings as keys. */ + struct hash_table * make_string_hash_table (int initial_size) { @@ -364,7 +513,7 @@ make_string_hash_table (int initial_size) #include int -print_hash_table_mapper (const void *key, void *value, void *count) +print_hash_table_mapper (void *key, void *value, void *count) { ++*(int *)count; printf ("%s: %s\n", (const char *)key, (char *)value); @@ -390,12 +539,24 @@ main (void) if (len <= 1) continue; line[--len] = '\0'; - hash_table_put (ht, strdup (line), "here I am!"); - if (len % 2) - hash_table_remove (ht, line); + if (!hash_table_exists (ht, line)) + hash_table_put (ht, strdup (line), "here I am!"); +#if 1 + if (len % 3) + { + char *line_copy; + if (hash_table_get_pair (ht, line, &line_copy, NULL)) + { + hash_table_remove (ht, line); + free (line_copy); + } + } +#endif } - print_hash (ht); #if 0 + print_hash (ht); +#endif +#if 1 printf ("%d %d %d\n", ht->count, ht->fullness, ht->size); #endif return 0; diff --git a/src/html-url.c b/src/html-url.c index 0441b470..dd59f18d 100644 --- a/src/html-url.c +++ b/src/html-url.c @@ -348,12 +348,11 @@ handle_link (struct collect_urls_closure *closure, const char *link_uri, newel->size = tag->attrs[attrid].value_raw_size; /* A URL is relative if the host and protocol are not named, and the - name does not start with `/'. - #### This logic might need some rethinking. */ + name does not start with `/'. */ if (no_proto && *link_uri != '/') - newel->flags |= (URELATIVE | UNOPROTO); - else if (no_proto) - newel->flags |= UNOPROTO; + newel->link_relative_p = 1; + else if (!no_proto) + newel->link_complete_p = 1; if (closure->tail) { diff --git a/src/http.c b/src/http.c index 4e14b3d6..8c36a255 100644 --- a/src/http.c +++ b/src/http.c @@ -239,18 +239,13 @@ static int http_process_type (const char *hdr, void *arg) { char **result = (char **)arg; - char *p; - - p = strrchr (hdr, ';'); - if (p) - { - int len = p - hdr; - *result = (char *)xmalloc (len + 1); - memcpy (*result, hdr, len); - (*result)[len] = '\0'; - } - else - *result = xstrdup (hdr); + /* Locate P on `;' or the terminating zero, whichever comes first. */ + const char *p = strchr (hdr, ';'); + if (!p) + p = hdr + strlen (hdr); + while (p > hdr && ISSPACE (*(p - 1))) + --p; + *result = strdupdelim (hdr, p); return 1; } diff --git a/src/recur.c b/src/recur.c index 8ee4e318..904d3671 100644 --- a/src/recur.c +++ b/src/recur.c @@ -168,7 +168,7 @@ recursive_retrieve (const char *file, const char *this_url) string_set_add (undesirable_urls, u->url); hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (u->url)); hash_table_put (dl_url_file_map, xstrdup (u->url), xstrdup (file)); - urls_html = slist_append (urls_html, file); + urls_html = slist_prepend (urls_html, file); if (opt.no_parent) base_dir = xstrdup (u->dir); /* Set the base dir. */ /* Set the canonical this_url to be sent as referer. This @@ -289,7 +289,7 @@ recursive_retrieve (const char *file, const char *this_url) /* If it is absolute link and they are not followed, chuck it out. */ if (!inl && u->proto != URLFTP) - if (opt.relative_only && !(cur_url->flags & URELATIVE)) + if (opt.relative_only && !cur_url->link_relative_p) { DEBUGP (("It doesn't really look like a relative link.\n")); string_set_add (undesirable_urls, constr); @@ -479,7 +479,7 @@ recursive_retrieve (const char *file, const char *this_url) xstrdup (constr), xstrdup (filename)); /* If the URL is HTML, note it. */ if (dt & TEXTHTML) - urls_html = slist_append (urls_html, filename); + urls_html = slist_prepend (urls_html, filename); } } /* If there was no error, and the type is text/html, parse @@ -514,7 +514,7 @@ recursive_retrieve (const char *file, const char *this_url) store the local filename. */ if (opt.convert_links && (dt & RETROKF) && (filename != NULL)) { - cur_url->flags |= UABS2REL; + cur_url->convert = CO_CONVERT_TO_RELATIVE; cur_url->local_name = xstrdup (filename); } } @@ -544,12 +544,13 @@ recursive_retrieve (const char *file, const char *this_url) return RETROK; } -/* Simple calls to convert_links will often fail because only the - downloaded files are converted, and Wget cannot know which files - will be converted in the future. So, if we have file fileone.html - with: +/* convert_links() is called from recursive_retrieve() after we're + done with an HTML file. This call to convert_links is not complete + because it converts only the downloaded files, and Wget cannot know + which files will be downloaded afterwards. So, if we have file + fileone.html with: - + and /c/something.gif was not downloaded because it exceeded the recursion depth, the reference will *not* be changed. @@ -572,14 +573,15 @@ recursive_retrieve (const char *file, const char *this_url) void convert_all_links (void) { - uerr_t res; - urlpos *l1, *urls; - struct urlinfo *u; slist *html; + /* Destructively reverse urls_html to get it in the right order. + recursive_retrieve() used slist_prepend() consistently. */ + urls_html = slist_nreverse (urls_html); + for (html = urls_html; html; html = html->next) { - int meta_disallow_follow; + urlpos *urls, *cur_url; char *url; DEBUGP (("Rescanning %s\n", html->string)); @@ -591,22 +593,17 @@ convert_all_links (void) else DEBUGP (("I cannot find the corresponding URL.\n")); /* Parse the HTML file... */ - urls = get_urls_html (html->string, url, FALSE, &meta_disallow_follow); - if (opt.use_robots && meta_disallow_follow) - { - /* The META tag says we are not to follow this file. - Respect that. */ - free_urlpos (urls); - urls = NULL; - } - if (!urls) - continue; - for (l1 = urls; l1; l1 = l1->next) + urls = get_urls_html (html->string, url, FALSE, NULL); + /* We don't respect meta_disallow_follow here because, even if + the file is not followed, we might still want to convert the + links that have been followed from other files. */ + for (cur_url = urls; cur_url; cur_url = cur_url->next) { char *local_name; + /* The URL must be in canonical form to be compared. */ - u = newurl (); - res = parseurl (l1->url, u, 0); + struct urlinfo *u = newurl (); + uerr_t res = parseurl (cur_url->url, u, 0); if (res != URLOK) { freeurl (u, 1); @@ -617,20 +614,28 @@ convert_all_links (void) ABS2REL, whereas non-downloaded will be converted REL2ABS. */ local_name = hash_table_get (dl_url_file_map, u->url); if (local_name) - DEBUGP (("%s flagged for conversion, local %s\n", + DEBUGP (("%s marked for conversion, local %s\n", u->url, local_name)); - /* Clear the flags. */ - l1->flags &= ~ (UABS2REL | UREL2ABS); /* Decide on the conversion direction. */ if (local_name) { - l1->flags |= UABS2REL; - l1->local_name = xstrdup (local_name); + /* We've downloaded this URL. Convert it to relative + form. We do this even if the URL already is in + relative form, because our directory structure may + not be identical to that on the server (think `-nd', + `--cut-dirs', etc.) */ + cur_url->convert = CO_CONVERT_TO_RELATIVE; + cur_url->local_name = xstrdup (local_name); } else { - l1->flags |= UREL2ABS; - l1->local_name = NULL; + /* We haven't downloaded this URL. If it's not already + complete (including a full host name), convert it to + that form, so it can be reached while browsing this + HTML locally. */ + if (!cur_url->link_complete_p) + cur_url->convert = CO_CONVERT_TO_COMPLETE; + cur_url->local_name = NULL; } freeurl (u, 1); } diff --git a/src/url.c b/src/url.c index c077f000..dcb51ac7 100644 --- a/src/url.c +++ b/src/url.c @@ -1313,6 +1313,8 @@ convert_links (const char *file, urlpos *l) char *p; downloaded_file_t downloaded_file_return; + logprintf (LOG_VERBOSE, _("Converting %s... "), file); + { /* First we do a "dry run": go through the list L and see whether any URL needs to be converted in the first place. If not, just @@ -1320,18 +1322,15 @@ convert_links (const char *file, urlpos *l) int count = 0; urlpos *dry = l; for (dry = l; dry; dry = dry->next) - if (dry->flags & (UABS2REL | UREL2ABS)) + if (dry->convert != CO_NOCONVERT) ++count; if (!count) { - logprintf (LOG_VERBOSE, _("Nothing to do while converting %s.\n"), - file); + logputs (LOG_VERBOSE, _("nothing to do.\n")); return; } } - logprintf (LOG_VERBOSE, _("Converting %s... "), file); - fm = read_file (file); if (!fm) { @@ -1376,10 +1375,9 @@ convert_links (const char *file, urlpos *l) break; } /* If the URL is not to be converted, skip it. */ - if (!(l->flags & (UABS2REL | UREL2ABS))) + if (l->convert == CO_NOCONVERT) { - DEBUGP (("Skipping %s at position %d (flags %d).\n", l->url, - l->pos, l->flags)); + DEBUGP (("Skipping %s at position %d.\n", l->url, l->pos)); continue; } @@ -1387,7 +1385,7 @@ convert_links (const char *file, urlpos *l) quote, to the outfile. */ fwrite (p, 1, url_start - p, fp); p = url_start; - if (l->flags & UABS2REL) + if (l->convert == CO_CONVERT_TO_RELATIVE) { /* Convert absolute URL to relative. */ char *newname = construct_relative (file, l->local_name); @@ -1396,11 +1394,11 @@ convert_links (const char *file, urlpos *l) p += l->size - 1; putc (*p, fp); /* close quote */ ++p; - DEBUGP (("ABS2REL: %s to %s at position %d in %s.\n", + DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n", l->url, newname, l->pos, file)); free (newname); } - else if (l->flags & UREL2ABS) + else if (l->convert == CO_CONVERT_TO_COMPLETE) { /* Convert the link to absolute URL. */ char *newlink = l->url; @@ -1409,7 +1407,7 @@ convert_links (const char *file, urlpos *l) p += l->size - 1; putc (*p, fp); /* close quote */ ++p; - DEBUGP (("REL2ABS: to %s at position %d in %s.\n", + DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n", newlink, l->pos, file)); } } diff --git a/src/url.h b/src/url.h index 648193fa..bfe78426 100644 --- a/src/url.h +++ b/src/url.h @@ -44,23 +44,35 @@ struct urlinfo document */ }; -enum uflags -{ - URELATIVE = 0x0001, /* Is URL relative? */ - UNOPROTO = 0x0002, /* Is URL without a protocol? */ - UABS2REL = 0x0004, /* Convert absolute to relative? */ - UREL2ABS = 0x0008 /* Convert relative to absolute? */ +enum convert_options { + CO_NOCONVERT = 0, /* don't convert this URL */ + CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to + "../../otherdir/foo.gif" */ + CO_CONVERT_TO_COMPLETE /* convert to absolute, e.g. to + "http://orighost/somedir/bar.jpg". */ }; /* A structure that defines the whereabouts of a URL, i.e. its position in an HTML document, etc. */ + typedef struct _urlpos { - char *url; /* URL */ - char *local_name; /* Local file to which it was saved */ - enum uflags flags; /* Various flags */ - int pos, size; /* Relative position in the buffer */ - struct _urlpos *next; /* Next struct in list */ + char *url; /* linked URL, after it has been + merged with the base */ + char *local_name; /* Local file to which it was saved */ + + /* Information about the original link: */ + int link_relative_p; /* was the link relative? */ + int link_complete_p; /* was the link complete (with the + host name, etc.) */ + + /* Conversion requirements: */ + enum convert_options convert; /* is conversion required? */ + + /* URL's position in the buffer. */ + int pos, size; + + struct _urlpos *next; /* Next struct in list */ } urlpos; /* downloaded_file() takes a parameter of this type and returns this type. */ diff --git a/src/utils.c b/src/utils.c index a6a08e2a..0d4de29f 100644 --- a/src/utils.c +++ b/src/utils.c @@ -931,7 +931,19 @@ merge_vecs (char **v1, char **v2) This used to also be used for searching, but now we have hash tables for that. */ -/* Append an element to the list. */ +/* It's a shame that these simple things like linked lists and hash + tables (see hash.c) need to be implemented over and over again. It + would be nice to be able to use the routines from glib -- see + www.gtk.org for details. However, that would make Wget depend on + glib, and I want to avoid dependencies to external libraries for + reasons of convenience and portability (I suspect Wget is more + portable than anything ever written for Gnome). */ + +/* Append an element to the list. If the list has a huge number of + elements, this can get slow because it has to find the list's + ending. If you think you have to call slist_append in a loop, + think about calling slist_prepend() followed by slist_nreverse(). */ + slist * slist_append (slist *l, const char *s) { @@ -950,6 +962,33 @@ slist_append (slist *l, const char *s) return beg; } +/* Prepend S to the list. Unlike slist_append(), this is O(1). */ + +slist * +slist_prepend (slist *l, const char *s) +{ + slist *newel = (slist *)xmalloc (sizeof (slist)); + newel->string = xstrdup (s); + newel->next = l; + return newel; +} + +/* Destructively reverse L. */ + +slist * +slist_nreverse (slist *l) +{ + slist *prev = NULL; + while (l) + { + slist *next = l->next; + l->next = prev; + prev = l; + l = next; + } + return prev; +} + /* Is there a specific entry in the list? */ int slist_contains (slist *l, const char *s) @@ -964,11 +1003,9 @@ slist_contains (slist *l, const char *s) void slist_free (slist *l) { - slist *n; - while (l) { - n = l->next; + slist *n = l->next; free (l->string); free (l); l = n; @@ -983,13 +1020,21 @@ slist_free (slist *l) void string_set_add (struct hash_table *ht, const char *s) { + /* First check whether the set element already exists. If it does, + do nothing so that we don't have to free() the old element and + then strdup() a new one. */ + if (hash_table_exists (ht, s)) + return; + /* We use "1" as value. It provides us a useful and clear arbitrary value, and it consumes no memory -- the pointers to the same - string "1" will be shared by all the key-value pairs in the hash - table. */ + string "1" will be shared by all the key-value pairs in all `set' + hash tables. */ hash_table_put (ht, xstrdup (s), "1"); } +/* Synonym for hash_table_exists... */ + int string_set_exists (struct hash_table *ht, const char *s) { diff --git a/src/utils.h b/src/utils.h index c9de38bb..e4ac6f1d 100644 --- a/src/utils.h +++ b/src/utils.h @@ -67,6 +67,8 @@ void read_file_free PARAMS ((struct file_memory *)); void free_vec PARAMS ((char **)); char **merge_vecs PARAMS ((char **, char **)); slist *slist_append PARAMS ((slist *, const char *)); +slist *slist_prepend PARAMS ((slist *, const char *)); +slist *slist_nreverse PARAMS ((slist *)); int slist_contains PARAMS ((slist *, const char *)); void slist_free PARAMS ((slist *));