/* * cutils.c module. * * Miscellaneous functions to speed up the IMDbPY package. * * Contents: * - pyratcliff(): * Function that implements the Ratcliff-Obershelp comparison * amongst Python strings. * * - pysoundex(): * Return a soundex code string, for the given string. * * Copyright 2004-2009 Davide Alberani * Released under the GPL license. * * NOTE: The Ratcliff-Obershelp part was heavily based on code from the * "simil" Python module. * The "simil" module is copyright of Luca Montecchiani * and can be found here: http://spazioinwind.libero.it/montecchiani/ * It was released under the GPL license; original comments are leaved * below. * */ /*========== Ratcliff-Obershelp ==========*/ /***************************************************************************** * * Stolen code from : * * [Python-Dev] Why is soundex marked obsolete? * by Eric S. Raymond [4]esr@thyrsus.com * on Sun, 14 Jan 2001 14:09:01 -0500 * *****************************************************************************/ /***************************************************************************** * * Ratcliff-Obershelp common-subpattern similarity. * * This code first appeared in a letter to the editor in Doctor * Dobbs's Journal, 11/1988. The original article on the algorithm, * "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the * July 1988 issue (#181) but the algorithm was presented in assembly. * The main drawback of the Ratcliff-Obershelp algorithm is the cost * of the pairwise comparisons. It is significantly more expensive * than stemming, Hamming distance, soundex, and the like. * * Running time quadratic in the data size, memory usage constant. * *****************************************************************************/ #include #define DONTCOMPARE_NULL 0.0 #define DONTCOMPARE_SAME 1.0 #define COMPARE 2.0 #define STRING_MAXLENDIFFER 0.7 /* As of 05 Mar 2008, the longest title is ~600 chars. */ #define MXLINELEN 1023 #define MAX(a,b) ((a) > (b) ? (a) : (b)) //***************************************** // preliminary check.... //***************************************** static float strings_check(char const *s, char const *t) { float threshold; // lenght difference int s_len = strlen(s); // length of s int t_len = strlen(t); // length of t // NULL strings ? if ((t_len * s_len) == 0) return (DONTCOMPARE_NULL); // the same ? if (strcmp(s, t) == 0) return (DONTCOMPARE_SAME); // string lenght difference threshold // we don't want to compare too different lenght strings ;) if (s_len < t_len) threshold = (float) s_len / (float) t_len; else threshold = (float) t_len / (float) s_len; if (threshold < STRING_MAXLENDIFFER) return (DONTCOMPARE_NULL); // proceed return (COMPARE); } static int RatcliffObershelp(char *st1, char *end1, char *st2, char *end2) { register char *a1, *a2; char *b1, *b2; char *s1 = st1, *s2 = st2; /* initializations are just to pacify GCC */ short max, i; if (end1 <= st1 || end2 <= st2) return (0); if (end1 == st1 + 1 && end2 == st2 + 1) return (0); max = 0; b1 = end1; b2 = end2; for (a1 = st1; a1 < b1; a1++) { for (a2 = st2; a2 < b2; a2++) { if (*a1 == *a2) { /* determine length of common substring */ for (i = 1; a1[i] && (a1[i] == a2[i]); i++) continue; if (i > max) { max = i; s1 = a1; s2 = a2; b1 = end1 - max; b2 = end2 - max; } } } } if (!max) return (0); max += RatcliffObershelp(s1 + max, end1, s2 + max, end2); /* rhs */ max += RatcliffObershelp(st1, s1, st2, s2); /* lhs */ return max; } static float ratcliff(char *s1, char *s2) /* compute Ratcliff-Obershelp similarity of two strings */ { int l1, l2; float res; // preliminary tests res = strings_check(s1, s2); if (res != COMPARE) return(res); l1 = strlen(s1); l2 = strlen(s2); return 2.0 * RatcliffObershelp(s1, s1 + l1, s2, s2 + l2) / (l1 + l2); } /* Change a string to lowercase. */ static void strtolower(char *s1) { int i; for (i=0; i < strlen(s1); i++) s1[i] = tolower(s1[i]); } /* Ratcliff-Obershelp for two python strings; returns a python float. */ static PyObject* pyratcliff(PyObject *self, PyObject *pArgs) { char *s1 = NULL; char *s2 = NULL; PyObject *discard = NULL; char s1copy[MXLINELEN+1]; char s2copy[MXLINELEN+1]; /* The optional PyObject parameter is here to be compatible * with the pure python implementation, which uses a * difflib.SequenceMatcher object. */ if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard)) return NULL; strncpy(s1copy, s1, MXLINELEN); strncpy(s2copy, s2, MXLINELEN); /* Work on copies. */ strtolower(s1copy); strtolower(s2copy); return Py_BuildValue("f", ratcliff(s1copy, s2copy)); } /*========== soundex ==========*/ /* Max length of the soundex code to output (an uppercase char and * _at most_ 4 digits). */ #define SOUNDEX_LEN 5 /* Group Number Lookup Table */ static char soundTable[26] = { 0 /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0 /* E */, '1' /* F */, '2' /* G */, 0 /* H */, 0 /* I */, '2' /* J */, '2' /* K */, '4' /* L */, '5' /* M */, '5' /* N */, 0 /* O */, '1' /* P */, '2' /* Q */, '6' /* R */, '2' /* S */, '3' /* T */, 0 /* U */, '1' /* V */, 0 /* W */, '2' /* X */, 0 /* Y */, '2' /* Z */}; static PyObject* pysoundex(PyObject *self, PyObject *pArgs) { int i, j, n; char *s = NULL; char word[MXLINELEN+1]; char soundCode[SOUNDEX_LEN+1]; char c; if (!PyArg_ParseTuple(pArgs, "s", &s)) return NULL; j = 0; n = strlen(s); /* Convert to uppercase and exclude non-ascii chars. */ for (i = 0; i < n; i++) { c = toupper(s[i]); if (c < 91 && c > 64) { word[j] = c; j++; } } word[j] = '\0'; n = strlen(word); if (n == 0) { /* If the string is empty, returns None. */ return Py_BuildValue(""); } soundCode[0] = word[0]; /* Build the soundCode string. */ j = 1; for (i = 1; j < SOUNDEX_LEN && i < n; i++) { c = soundTable[(word[i]-65)]; /* Compact zeroes and equal consecutive digits ("12234112"->"123412") */ if (c != 0 && c != soundCode[j-1]) { soundCode[j++] = c; } } soundCode[j] = '\0'; return Py_BuildValue("s", soundCode); } static PyMethodDef cutils_methods[] = { {"ratcliff", pyratcliff, METH_VARARGS, "Ratcliff-Obershelp similarity."}, {"soundex", pysoundex, METH_VARARGS, "Soundex code for strings."}, {NULL} }; void initcutils(void) { Py_InitModule("cutils", cutils_methods); }