mirror of
https://github.com/moparisthebest/SickRage
synced 2024-11-14 21:35:03 -05:00
270 lines
7.0 KiB
C
270 lines
7.0 KiB
C
|
/*
|
||
|
* cutils.c module.
|
||
|
*
|
||
|
* Miscellaneous functions to speed up the IMDbPY package.
|
||
|
*
|
||
|
* Contents:
|
||
|
* - pyratcliff():
|
||
|
* Function that implements the Ratcliff-Obershelp comparison
|
||
|
* amongst Python strings.
|
||
|
*
|
||
|
* - pysoundex():
|
||
|
* Return a soundex code string, for the given string.
|
||
|
*
|
||
|
* Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
|
||
|
* Released under the GPL license.
|
||
|
*
|
||
|
* NOTE: The Ratcliff-Obershelp part was heavily based on code from the
|
||
|
* "simil" Python module.
|
||
|
* The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it>
|
||
|
* and can be found here: http://spazioinwind.libero.it/montecchiani/
|
||
|
* It was released under the GPL license; original comments are left
* below.
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
|
||
|
/*========== Ratcliff-Obershelp ==========*/
|
||
|
/*****************************************************************************
|
||
|
*
|
||
|
* Stolen code from :
|
||
|
*
|
||
|
* [Python-Dev] Why is soundex marked obsolete?
|
||
|
* by Eric S. Raymond [4]esr@thyrsus.com
|
||
|
* on Sun, 14 Jan 2001 14:09:01 -0500
|
||
|
*
|
||
|
*****************************************************************************/
|
||
|
|
||
|
/*****************************************************************************
|
||
|
*
|
||
|
* Ratcliff-Obershelp common-subpattern similarity.
|
||
|
*
|
||
|
* This code first appeared in a letter to the editor in Dr. Dobb's
* Journal, 11/1988. The original article on the algorithm,
|
||
|
* "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the
|
||
|
* July 1988 issue (#181) but the algorithm was presented in assembly.
|
||
|
* The main drawback of the Ratcliff-Obershelp algorithm is the cost
|
||
|
* of the pairwise comparisons. It is significantly more expensive
|
||
|
* than stemming, Hamming distance, soundex, and the like.
|
||
|
*
|
||
|
* Running time quadratic in the data size, memory usage constant.
|
||
|
*
|
||
|
*****************************************************************************/
|
||
|
|
||
|
#include <Python.h>
|
||
|
|
||
|
/* Verdicts returned by strings_check().  The first two are also handed
 * back unchanged by ratcliff() as the final similarity value. */
#define DONTCOMPARE_NULL    0.0     /* do not compare: similarity is 0.0 */
#define DONTCOMPARE_SAME    1.0     /* strings are equal: similarity is 1.0 */
#define COMPARE             2.0     /* run the full comparison */
/* Minimum shorter/longer length ratio; below it strings are not compared. */
#define STRING_MAXLENDIFFER 0.7

/* Maximum number of characters (terminator excluded) held by the
 * fixed-size work buffers.
 * As of 05 Mar 2008, the longest title is ~600 chars. */
#define MXLINELEN   1023

/* Larger of two values; note that each argument may be evaluated twice. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||
|
|
||
|
|
||
|
//*****************************************
|
||
|
// preliminary check....
|
||
|
//*****************************************
|
||
|
static float
|
||
|
strings_check(char const *s, char const *t)
|
||
|
{
|
||
|
float threshold; // lenght difference
|
||
|
int s_len = strlen(s); // length of s
|
||
|
int t_len = strlen(t); // length of t
|
||
|
|
||
|
// NULL strings ?
|
||
|
if ((t_len * s_len) == 0)
|
||
|
return (DONTCOMPARE_NULL);
|
||
|
|
||
|
// the same ?
|
||
|
if (strcmp(s, t) == 0)
|
||
|
return (DONTCOMPARE_SAME);
|
||
|
|
||
|
// string lenght difference threshold
|
||
|
// we don't want to compare too different lenght strings ;)
|
||
|
if (s_len < t_len)
|
||
|
threshold = (float) s_len / (float) t_len;
|
||
|
else
|
||
|
threshold = (float) t_len / (float) s_len;
|
||
|
if (threshold < STRING_MAXLENDIFFER)
|
||
|
return (DONTCOMPARE_NULL);
|
||
|
|
||
|
// proceed
|
||
|
return (COMPARE);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*
 * Recursive core of the Ratcliff-Obershelp comparison.
 *
 * Works on the half-open ranges [st1, end1) and [st2, end2).  It locates
 * the first longest run of characters common to both ranges, then adds
 * the scores of the pieces to the right and to the left of that run.
 *
 * Returns the total number of matching characters found this way; 0 when
 * either range is empty or nothing matches.
 *
 * Historical quirk, kept on purpose: when both ranges are exactly one
 * character long the result is 0, even if the two characters are equal.
 */
static int
RatcliffObershelp(char *st1, char *end1, char *st2, char *end2)
{
    char *a1, *a2;              /* scan positions in the two ranges */
    char *b1, *b2;              /* scan limits, tightened as max grows */
    char *s1 = st1, *s2 = st2;  /* start of the best run found so far */
    int max, i;

    if (end1 <= st1 || end2 <= st2)
        return (0);
    if (end1 == st1 + 1 && end2 == st2 + 1)
        return (0);

    max = 0;
    b1 = end1;
    b2 = end2;

    for (a1 = st1; a1 < b1; a1++) {
        for (a2 = st2; a2 < b2; a2++) {
            if (*a1 == *a2) {
                /* Determine length of common substring.  The run must
                 * stay inside both ranges: in the left-hand recursion the
                 * ranges end where the parent's match begins, not at a
                 * NUL, so an unbounded scan would count characters twice
                 * and push b1/b2 before the start of the range. */
                for (i = 1;
                     a1 + i < end1 && a2 + i < end2 && a1[i] == a2[i];
                     i++)
                    continue;
                if (i > max) {
                    max = i;
                    s1 = a1;
                    s2 = a2;
                    /* A longer run cannot start past these points. */
                    b1 = end1 - max;
                    b2 = end2 - max;
                }
            }
        }
    }
    if (!max)
        return (0);
    max += RatcliffObershelp(s1 + max, end1, s2 + max, end2);  /* rhs */
    max += RatcliffObershelp(st1, s1, st2, s2);                /* lhs */
    return max;
}
|
||
|
|
||
|
|
||
|
static float
|
||
|
ratcliff(char *s1, char *s2)
|
||
|
/* compute Ratcliff-Obershelp similarity of two strings */
|
||
|
{
|
||
|
int l1, l2;
|
||
|
float res;
|
||
|
|
||
|
// preliminary tests
|
||
|
res = strings_check(s1, s2);
|
||
|
if (res != COMPARE)
|
||
|
return(res);
|
||
|
|
||
|
l1 = strlen(s1);
|
||
|
l2 = strlen(s2);
|
||
|
|
||
|
return 2.0 * RatcliffObershelp(s1, s1 + l1, s2, s2 + l2) / (l1 + l2);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*
 * Change a string to lowercase, in place.
 *
 * s1: NUL-terminated, writable string (must not be NULL).
 */
static void
strtolower(char *s1)
{
    char *p;

    /* Walk up to the terminator instead of calling strlen() at every
     * iteration.  The cast is required: tolower() is only defined for
     * values representable as unsigned char (or EOF), and plain char
     * may be negative for non-ASCII bytes. */
    for (p = s1; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);
}
|
||
|
|
||
|
|
||
|
/* Ratcliff-Obershelp for two python strings; returns a python float. */
|
||
|
static PyObject*
|
||
|
pyratcliff(PyObject *self, PyObject *pArgs)
|
||
|
{
|
||
|
char *s1 = NULL;
|
||
|
char *s2 = NULL;
|
||
|
PyObject *discard = NULL;
|
||
|
char s1copy[MXLINELEN+1];
|
||
|
char s2copy[MXLINELEN+1];
|
||
|
|
||
|
/* The optional PyObject parameter is here to be compatible
|
||
|
* with the pure python implementation, which uses a
|
||
|
* difflib.SequenceMatcher object. */
|
||
|
if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard))
|
||
|
return NULL;
|
||
|
|
||
|
strncpy(s1copy, s1, MXLINELEN);
|
||
|
strncpy(s2copy, s2, MXLINELEN);
|
||
|
/* Work on copies. */
|
||
|
strtolower(s1copy);
|
||
|
strtolower(s2copy);
|
||
|
|
||
|
return Py_BuildValue("f", ratcliff(s1copy, s2copy));
|
||
|
}
|
||
|
|
||
|
|
||
|
/*========== soundex ==========*/
/* Max length of the soundex code to output (an uppercase char and
 * _at most_ 4 digits). */
#define SOUNDEX_LEN 5

/* Group Number Lookup Table: one entry per uppercase letter, indexed by
 * (letter - 'A').  An entry is the digit character of the letter's
 * group, or 0 for letters that produce no digit.  Read-only. */
static const char soundTable[26] = {
    0   /* A */, '1' /* B */, '2' /* C */, '3' /* D */, 0   /* E */,
    '1' /* F */, '2' /* G */, 0   /* H */, 0   /* I */, '2' /* J */,
    '2' /* K */, '4' /* L */, '5' /* M */, '5' /* N */, 0   /* O */,
    '1' /* P */, '2' /* Q */, '6' /* R */, '2' /* S */, '3' /* T */,
    0   /* U */, '1' /* V */, 0   /* W */, '2' /* X */, 0   /* Y */,
    '2' /* Z */
};
|
||
|
|
||
|
/*
 * Return a soundex code string for the given python string.
 *
 * Only ASCII letters are considered (case-insensitively); every other
 * byte is skipped.  The code is the first letter, uppercased, followed by
 * up to SOUNDEX_LEN-1 digits taken from soundTable.  Letters with no
 * digit are dropped, and a digit equal to the character emitted just
 * before it is not repeated.
 *
 * Returns None when the string contains no ASCII letter, and NULL (with
 * the Python exception set by PyArg_ParseTuple) on bad arguments.
 */
static PyObject*
pysoundex(PyObject *self, PyObject *pArgs)
{
    char *s = NULL;
    const char *p;
    char soundCode[SOUNDEX_LEN+1];
    int j = 0;      /* number of characters written to soundCode */

    if (!PyArg_ParseTuple(pArgs, "s", &s))
        return NULL;

    /* Encode while scanning.  No intermediate copy of the letters is
     * kept, so the input length is not limited by any fixed buffer; the
     * scan stops as soon as the code is complete. */
    for (p = s; *p != '\0' && j < SOUNDEX_LEN; p++) {
        /* Convert to uppercase and exclude non-ascii chars.  The cast
         * keeps toupper() away from negative values, for which its
         * behaviour is undefined. */
        int c = toupper((unsigned char) *p);
        char digit;

        if (c < 'A' || c > 'Z')
            continue;

        if (j == 0) {
            /* The first letter is emitted as is. */
            soundCode[j++] = (char) c;
            continue;
        }

        digit = soundTable[c - 'A'];
        /* Compact zeroes and equal consecutive digits
         * ("12234112"->"123412"). */
        if (digit != 0 && digit != soundCode[j-1]) {
            soundCode[j++] = digit;
        }
    }

    if (j == 0) {
        /* If the string is empty, returns None. */
        return Py_BuildValue("");
    }
    soundCode[j] = '\0';

    return Py_BuildValue("s", soundCode);
}
|
||
|
|
||
|
|
||
|
static PyMethodDef cutils_methods[] = {
|
||
|
{"ratcliff", pyratcliff,
|
||
|
METH_VARARGS, "Ratcliff-Obershelp similarity."},
|
||
|
{"soundex", pysoundex,
|
||
|
METH_VARARGS, "Soundex code for strings."},
|
||
|
{NULL}
|
||
|
};
|
||
|
|
||
|
|
||
|
void
|
||
|
initcutils(void)
|
||
|
{
|
||
|
Py_InitModule("cutils", cutils_methods);
|
||
|
}
|
||
|
|
||
|
|