From e5e3a1ca691c11505a874328f85359e5d06fa760 Mon Sep 17 00:00:00 2001
From: moparisthebest
Date: Tue, 11 Feb 2014 01:02:19 -0600
Subject: [PATCH] Add SphinxQL search support
---
Sources/SearchAPI-Sphinxql.php | 312 +++++++++++++++++++++
sphinx_config.php | 483 +++++++++++++++++++++++++++++++++
2 files changed, 795 insertions(+)
create mode 100644 Sources/SearchAPI-Sphinxql.php
create mode 100644 sphinx_config.php
diff --git a/Sources/SearchAPI-Sphinxql.php b/Sources/SearchAPI-Sphinxql.php
new file mode 100644
index 0000000..66b1f82
--- /dev/null
+++ b/Sources/SearchAPI-Sphinxql.php
@@ -0,0 +1,312 @@
+supported_databases) || empty($modSettings['sphinxql_searchd_port']))
+ {
+ $this->is_supported = false;
+ return;
+ }
+ }
+
+ // Check whether the search can be performed by this API.
+ public function supportsMethod($methodName, $query_params = null)
+ {
+     // Reports whether this API implements the named search hook.
+     // $query_params is accepted but never consulted here.
+     switch ($methodName)
+     {
+         case 'searchSort':
+         case 'prepareIndexes':
+         case 'indexedWordQuery':
+         case 'searchQuery':
+             return true;
+
+         default:
+             // All other methods, too bad dunno you.
+             return false;
+     }
+ }
+
+ // This function compares the length of two strings plus a little.
+ public function searchSort($a, $b)
+ {
+     global $modSettings, $excludedWords;
+
+     // Each word is weighted by its byte length; a word listed in $excludedWords
+     // loses 1000 points, which pushes it towards the end of the ordering.
+     $weightA = strlen($a);
+     if (in_array($a, $excludedWords))
+         $weightA -= 1000;
+
+     $weightB = strlen($b);
+     if (in_array($b, $excludedWords))
+         $weightB -= 1000;
+
+     // Descending order: the heavier word sorts first.
+     if ($weightA === $weightB)
+         return 0;
+
+     return $weightA < $weightB ? 1 : -1;
+ }
+
+ // Do we have to do some work with the words we are searching for to prepare them?
+ public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
+ {
+     global $modSettings;
+
+     // Anything text2words() does not reduce to exactly one word is treated as a
+     // phrase and wrapped in double quotes.
+     $pieces = text2words($word, null, false);
+     if (count($pieces) === 1)
+         $term = $word;
+     else
+         $term = '"' . $word . '"';
+
+     // Every term is recorded as an indexed word; excluded terms are tracked as well.
+     $wordsSearch['indexed_words'][] = $term;
+
+     if ($isExcluded)
+         $wordsExclude[] = $term;
+ }
+
+ // This has its own custom search.
+ public function searchQuery($search_params, $search_words, $excluded_words, &$participants, &$search_results)
+ {
+     global $user_info, $context, $modSettings;
+
+     // Runs the search through the daemon's SphinxQL listener, fills $context['topics'],
+     // $participants and $search_results for the requested page and returns the total
+     // number of matches. Result sets are cached for 600 seconds under a key derived
+     // from the user's board access and the search parameters.
+     $cache_key = 'search_results_' . md5($user_info['query_see_board'] . '_' . $context['params']);
+
+     // Only request the results if they haven't been cached yet.
+     if (($cached_results = cache_get_data($cache_key)) === null)
+     {
+         // Construct the (binary mode) query first, so that an empty search returns
+         // before any connection is opened (the connection used to be leaked here).
+         $where_match = $this->_constructQuery($search_params['search']);
+
+         // Nothing to search, return zero results
+         if (trim($where_match) == '')
+             return 0;
+
+         if ($search_params['subject_only'])
+             $where_match = '@subject ' . $where_match;
+
+         // Compile different options for our query
+         $query = 'SELECT * FROM smf_index WHERE MATCH(\'' . $where_match . '\')';
+
+         // Set the limits based on the search parameters. Every id is forced to an
+         // integer; a missing lower bound becomes 0 instead of producing "@id >=  AND".
+         $extra_where = array();
+         if (!empty($search_params['min_msg_id']) || !empty($search_params['max_msg_id']))
+         {
+             $min_msg_id = empty($search_params['min_msg_id']) ? 0 : (int) $search_params['min_msg_id'];
+             $max_msg_id = empty($search_params['max_msg_id']) ? (int) $modSettings['maxMsgID'] : (int) $search_params['max_msg_id'];
+             $extra_where[] = '@id >= ' . $min_msg_id . ' AND @id <=' . $max_msg_id;
+         }
+         if (!empty($search_params['topic']))
+             $extra_where[] = 'id_topic = ' . (int) $search_params['topic'];
+         if (!empty($search_params['brd']))
+             $extra_where[] = 'id_board IN (' . implode(',', array_map('intval', $search_params['brd'])) . ')';
+         if (!empty($search_params['memberlist']))
+             $extra_where[] = 'id_member IN (' . implode(',', array_map('intval', $search_params['memberlist'])) . ')';
+
+         if (!empty($extra_where))
+             $query .= ' AND ' . implode(' AND ', $extra_where);
+
+         // Put together a sort string; besides the main column sort (relevance, id_topic, or num_replies),
+         // add secondary sorting based on relevance value (if not the main sort method) and age.
+         // Anything other than ASC falls back to DESC so the direction cannot carry arbitrary text.
+         $sort_dir = strtoupper($search_params['sort_dir']) === 'ASC' ? 'ASC' : 'DESC';
+         $sphinx_sort = ($search_params['sort'] === 'id_msg' ? 'id_topic' : $search_params['sort']) . ' ' . $sort_dir . ($search_params['sort'] === 'relevance' ? '' : ', relevance desc') . ', poster_time DESC';
+
+         // Grouping by topic id makes it return only one result per topic, so don't set that for in-topic searches
+         if (empty($search_params['topic']))
+             $query .= ' GROUP BY id_topic WITHIN GROUP ORDER BY ' . $sphinx_sort;
+         $query .= ' ORDER BY ' . $sphinx_sort;
+
+         $query .= ' LIMIT 0,' . (int) $modSettings['sphinx_max_results'];
+
+         // Connect to the daemon's MySQL-protocol listener.
+         // NOTE(review): the mysql_* extension was removed in PHP 7.0; porting this to mysqli is a separate change.
+         $server = $modSettings['sphinx_searchd_server'] == 'localhost' ? '127.0.0.1' : $modSettings['sphinx_searchd_server'];
+         $mySphinx = mysql_connect($server . ':' . (int) $modSettings['sphinxql_searchd_port']);
+
+         // Can a connection to the daemon be made?
+         // fatal_lang_error() is expected not to return, as the code below has always assumed.
+         if ($mySphinx === false)
+             fatal_lang_error('error_no_search_daemon');
+
+         // Execute the search query.
+         $request = mysql_query($query, $mySphinx);
+         if ($request === false)
+         {
+             // Just log the error.
+             if (mysql_error($mySphinx))
+                 log_error(mysql_error($mySphinx));
+             mysql_close($mySphinx);
+             fatal_lang_error('error_no_search_daemon');
+         }
+
+         // Get the relevant information from the search results, keyed by the row id.
+         $cached_results = array(
+             'matches' => array(),
+         );
+         while ($match = mysql_fetch_assoc($request))
+             $cached_results['matches'][$match['id']] = array(
+                 'id' => $match['id_topic'],
+                 'relevance' => round($match['relevance'] / 10000, 1) . '%',
+                 'num_matches' => empty($search_params['topic']) ? $match['@count'] : 0,
+                 'matches' => array(),
+             );
+         mysql_free_result($request);
+         mysql_close($mySphinx);
+
+         $cached_results['total'] = count($cached_results['matches']);
+
+         // Store the search results in the cache.
+         cache_put_data($cache_key, $cached_results, 600);
+     }
+
+     // Hand out only the slice belonging to the requested page.
+     $start = isset($_REQUEST['start']) ? (int) $_REQUEST['start'] : 0;
+     $participants = array();
+     foreach (array_slice(array_keys($cached_results['matches']), $start, $modSettings['search_results_per_page']) as $msgID)
+     {
+         $context['topics'][$msgID] = $cached_results['matches'][$msgID];
+         $participants[$cached_results['matches'][$msgID]['id']] = false;
+     }
+
+     // Sentences need to be broken up in words for proper highlighting.
+     $search_results = array();
+     foreach ($search_words as $words)
+         $search_results = array_merge($search_results, $words['subject_words']);
+
+     return $cached_results['total'];
+ }
+
+ /**
+ * Constructs a binary mode query to pass back to sphinx
+ *
+ * @param string $string The user entered query to construct with
+ * @return string A binary mode query
+ */
+ function _constructQuery($string)
+ {
+     // Tokens are bare words or "quoted phrases", each optionally prefixed with "-"
+     // to exclude it. OR / | joins a word to the previous one; AND / & is ignored.
+     $keywords = array('include' => array(), 'exclude' => array());
+
+     // Split our search string and return an empty string if no matches
+     if (!preg_match_all('~ (-?)("[^"]+"|[^" ]+)~', ' ' . $string, $tokens, PREG_SET_ORDER))
+         return '';
+
+     // First we split our string into included and excluded words and phrases
+     $or_part = false;
+     foreach ($tokens as $token)
+     {
+         // Strip the quotes off of a phrase
+         $phrase = $token[2][0] == '"';
+         if ($phrase)
+             $token[2] = substr($token[2], 1, -1);
+
+         // Prepare this token
+         $cleanWords = $this->_cleanString($token[2]);
+         $isBlank = trim($cleanWords) == '';
+
+         // Explode the cleanWords again in case the cleaning put more spaces into it.
+         // A limit of -1 means "no limit"; NULL is deprecated for this parameter.
+         $addWords = $phrase ? array('"' . $cleanWords . '"') : preg_split('~ ~u', $cleanWords, -1, PREG_SPLIT_NO_EMPTY);
+
+         if ($token[1] == '-')
+         {
+             // A phrase that cleaned down to nothing used to be excluded as the empty phrase `" "`.
+             if (!$isBlank)
+                 $keywords['exclude'] = array_merge($keywords['exclude'], $addWords);
+         }
+
+         // OR'd keywords (we only do this if we have something to OR with)
+         elseif (($token[2] == 'OR' || $token[2] == '|') && count($keywords['include']))
+         {
+             // Turn the previous include into a group the next word is appended to.
+             $last = array_pop($keywords['include']);
+             if (!is_array($last))
+                 $last = array($last);
+             $keywords['include'][] = $last;
+             $or_part = true;
+             continue;
+         }
+
+         // AND is implied in a Sphinx Search
+         elseif ($token[2] == 'AND' || $token[2] == '&')
+             continue;
+
+         // If this part of the query ended up being blank, skip it
+         elseif ($isBlank)
+             continue;
+
+         // Must be something they want to search for!
+         else
+         {
+             // If this was part of an OR branch, add it to the proper section
+             if ($or_part)
+             {
+                 $group = count($keywords['include']) - 1;
+                 $keywords['include'][$group] = array_merge($keywords['include'][$group], $addWords);
+             }
+             else
+                 $keywords['include'] = array_merge($keywords['include'], $addWords);
+         }
+
+         // Start fresh on this...
+         $or_part = false;
+     }
+
+     // Let's make sure they're not canceling each other out. OR groups are nested
+     // arrays, which array_diff() cannot compare, so flatten the includes first.
+     $flat_include = array();
+     foreach ($keywords['include'] as $keyword)
+         $flat_include = array_merge($flat_include, (array) $keyword);
+
+     if (!count(array_diff($flat_include, $keywords['exclude'])))
+         return '';
+
+     // Now we compile our arrays into a valid search string
+     $query_parts = array();
+     foreach ($keywords['include'] as $keyword)
+         $query_parts[] = is_array($keyword) ? '(' . implode(' | ', $keyword) . ')' : $keyword;
+
+     foreach ($keywords['exclude'] as $keyword)
+         $query_parts[] = '-' . $keyword;
+
+     return implode(' ', $query_parts);
+ }
+
+ /**
+ * Cleans a string of everything but alphanumeric characters
+ *
+ * @param string $string A string to clean
+ * @return string A cleaned up string
+ */
+ function _cleanString($string)
+ {
+     global $smcFunc;
+
+     // Turn entities back into characters, then lowercase through the forum's helper.
+     $decoded = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
+     $lowered = $smcFunc['strtolower']($decoded);
+
+     // Remove each digit run that is followed by punctuation and another digit,
+     // together with that punctuation ("555-1234" becomes "1234").
+     // NOTE(review): the capture group is never used in the replacement - confirm
+     // whether '$1' (joining the digit groups) was intended.
+     $withoutNumberPunctuation = preg_replace('~([[:digit:]]+)\pP+(?=[[:digit:]])~u', '', $lowered);
+
+     // Finally collapse every run of non-letter, non-number characters into one space.
+     return preg_replace('~[^\pL\pN]+~u', ' ', $withoutNumberPunctuation);
+ }
+}
+
+?>
\ No newline at end of file
diff --git a/sphinx_config.php b/sphinx_config.php
new file mode 100644
index 0000000..ec3444e
--- /dev/null
+++ b/sphinx_config.php
@@ -0,0 +1,483 @@
+
+ This configuration tool is designed to guide you through the installation of the Sphinx full-text search engine, specifically for Simple Machines Forum. Following the steps in this tool will tell how to install Sphinx, will configure SMF for using Sphinx, and will create a configuration file that will be needed for Sphinx based on SMF\'s settings. Make sure you have the latest version of this tool, so that the latest improvements have been implemented.
+
+
What is Sphinx?
+
+ Sphinx is an Open Source full-text search engine. It can index texts and find documents within fractions of seconds, a lot faster than MySQL. Sphinx consists of a few components:
+
+ There\'s the indexer that creates the full-text index from the existing tables in MySQL. The indexer is run as a cron job each time, allowing it to update the index once in a while. Based on the configuration file, the indexer knows how to connect to MySQL and which tables it needs to query.
+
+ Another important component is the search deamon (called searchd). This deamon runs as a process and awaits requests for information from the fulltext indexes. External processes, like the webserver, can send a query to it. The search deamon will then consult the index and return the result to the external process.
+
+
+
When should Sphinx be used for Simple Machines Forum?
+
+ Basically Sphinx starts to get interesting when MySQL is unable to do the job of indexing the messages properly. In most cases, a board needs to have at least 300,000 messages before that point has been reached. Also if you want to make sure the search queries don\'t affect the database performance, you can choose to put Sphinx on a different server than the database server.
+
+
+
Requirements for Sphinx
+
+
Root access to the server you\'re installing Sphinx
+
Linux 2.4.x+ / Windows 2000/XP / FreeBSD 4.x+ / NetBSD 1.6 (this tool will assume Linux as operating system)
+ This tool will assume you will be installing Sphinx version ', $sphinx_ver, '. A newer version might be available and, if so, would probably be better. Just understand that the steps below and the working of the search engine might be different in future versions of Sphinx. Please note that Sphinx versions prior to 0.9.9 will not work properly.
+
+
Retrieving and unpacking the package
+
+ Grab the file from the Sphinx website:
+ [~]# wget http://www.sphinxsearch.com/downloads/sphinx-', $sphinx_ver, '.tar.gz
+
+ Untar the package:
+ [~]# tar -xzvf sphinx-', $sphinx_ver, '.tar.gz
+
+ Go to the Sphinx directory:
+ [~]# cd sphinx-', $sphinx_ver, '
+
+
Compiling Sphinx
+ Configure Sphinx (generally no options are needed):
+ [~]# ./configure
+
+ If everything went well, run the make tool:
+ [~]# make
+
+ If that went well too, make the install:
+ [~]# make install
+
+ ';
+}
+
+
+function step_2() // Step 2: prints the "Configure SMF for Sphinx" page.
+{
+ global $context, $modSettings, $txt; // NOTE(review): none of these is referenced in the visible body; the echoed markup looks stripped of its HTML and may have used them - confirm before removing.
+ // Page header titled for this step (template_sphinx_config_above() is defined later in this file).
+ template_sphinx_config_above('Configure SMF for Sphinx');
+ // Page body: only the introductory sentence is visible in this literal.
+ echo '
+ A few settings can be configured allowing to customize the search engine. Generally all options can be left untouched.
+
+ ';
+ // Presumably the matching page footer; template_sphinx_config_below() is not visible in this chunk.
+ template_sphinx_config_below();
+}
+
+function step_3()
+{
+    global $context, $modSettings, $txt;
+
+    // Step 3: stores the submitted Sphinx settings, then prints the instructions for
+    // generating sphinx.conf, building the index and scheduling the indexer.
+    checkSession();
+
+    // The string settings are later written verbatim into single lines of the generated
+    // sphinx.conf (see step_999), so line breaks are removed. Missing fields fall back
+    // to '' / 0 instead of raising undefined-index notices.
+    $posted = array();
+    foreach (array('sphinx_data_path', 'sphinx_log_path', 'sphinx_stopword_path', 'sphinx_searchd_server') as $field)
+        $posted[$field] = isset($_POST[$field]) && is_string($_POST[$field]) ? str_replace(array("\r", "\n"), '', $_POST[$field]) : '';
+    foreach (array('sphinx_indexer_mem', 'sphinx_searchd_port', 'sphinxql_searchd_port', 'sphinx_max_results') as $field)
+        $posted[$field] = isset($_POST[$field]) && is_scalar($_POST[$field]) ? (int) $_POST[$field] : 0;
+
+    updateSettings(array(
+        'sphinx_data_path' => rtrim($posted['sphinx_data_path'], '/'),
+        'sphinx_log_path' => rtrim($posted['sphinx_log_path'], '/'),
+        'sphinx_stopword_path' => $posted['sphinx_stopword_path'],
+        'sphinx_indexer_mem' => $posted['sphinx_indexer_mem'],
+        'sphinx_searchd_server' => $posted['sphinx_searchd_server'],
+        'sphinx_searchd_port' => $posted['sphinx_searchd_port'],
+        'sphinxql_searchd_port' => $posted['sphinxql_searchd_port'],
+        'sphinx_max_results' => $posted['sphinx_max_results'],
+    ));
+
+    // Seed the marker read by the generated sql_query_range the first time through.
+    if (!isset($modSettings['sphinx_indexed_msg_until']))
+        updateSettings(array(
+            'sphinx_indexed_msg_until' => '1',
+        ));
+
+    template_sphinx_config_above('Configure SMF for Sphinx');
+    echo '
+ Your configuration has been saved successfully. The next time you run this tool, your configuration will automatically be loaded.
+
Generating a configuration file
+ Based on the settings you submitted in the previous screen, this tool can generate a configuration file for you that will be used by Sphinx. Press the button below to generate the configuration file, and upload it to /usr/local/etc/sphinx.conf (default configuration).
+
+
+
+
+ It\'s time to create the full-text index:
+ [~]# indexer --config /usr/local/etc/sphinx.conf --all
+
+ If that went successful, we can test run the search deamon. Start it by typing:
+ [~]# searchd --config /usr/local/etc/sphinx.conf
+
+ If everything worked so far, congratulations, Sphinx has been installed and works! Next step is modifying SMF\'s search to work with Sphinx.
+
+
Configuring SMF
+ Upload the SearchAPI-Sphinxql.php file to the \'Sources\' directory.
+ Select \'Sphinx\' as database index below and press \'Change Search Index\'. Test your search function afterwards, it should work now!
+
+
+
+
+
Creating a cron job for the indexer
+ In order to keep the full-text index up to date, you need to add a cron job that will update the index from time to time. The configuration file defines two indexes: smf_delta_index, an index that only stores the recent changes and can be called frequently. smf_base_index, an index that stores the full database and should be called less frequently.
+
+ Adding the following lines to /etc/crontab would let the index rebuild every day (at 3 am) and update the most recently changed messages each hour:
+ # search indexer
+ 10 3 * * * /usr/local/bin/indexer --config /usr/local/etc/sphinx.conf --rotate smf_base_index
+ 0 * * * * /usr/local/bin/indexer --config /usr/local/etc/sphinx.conf --rotate smf_delta_index
+
+
+ ';
+    template_sphinx_config_below();
+}
+
+function step_888()
+{
+    global $modSettings;
+
+    // Step 888: stores the chosen search index type and prints a confirmation.
+    checkSession();
+
+    // A missing parameter is treated as '' (no index), which is what the loose
+    // comparison against an undefined value amounted to before.
+    $search_index = isset($_REQUEST['search_index']) ? $_REQUEST['search_index'] : '';
+
+    // Strict comparison: only the exact known index names are accepted.
+    if (in_array($search_index, array('', 'fulltext', 'custom', 'sphinx', 'sphinxql'), true))
+        updateSettings(array(
+            'search_index' => $search_index,
+        ));
+
+    echo 'Setting has been saved. This window can be closed.';
+}
+
+
+
+function step_999()
+{
+    global $context, $db_server, $db_name, $db_user, $db_passwd, $db_prefix;
+    global $db_character_set, $modSettings;
+
+    // Step 999: sends a generated sphinx.conf as a file download, built from the
+    // forum's database credentials, the saved sphinx_* settings and the search weights.
+
+    // Topics with at least this many replies get the full "length" relevance weight.
+    $humungousTopicPosts = 200;
+
+    // Drop one level of buffered page output, but only if a buffer is actually open.
+    if (ob_get_level() > 0)
+        ob_end_clean();
+    header('Pragma: ');
+    if (empty($context['browser']['is_gecko']))
+        header('Content-Transfer-Encoding: binary');
+    header('Connection: close');
+    header('Content-Disposition: attachment; filename="sphinx.conf"');
+    header('Content-Type: application/octet-stream');
+
+    // Read the configured relevance weights; unset or empty ones count as 0.
+    $weight_factors = array(
+        'age',
+        'length',
+        'first_message',
+        'sticky',
+    );
+    $weight = array();
+    $weight_total = 0;
+    foreach ($weight_factors as $weight_factor)
+    {
+        $weight[$weight_factor] = empty($modSettings['search_weight_' . $weight_factor]) ? 0 : (int) $modSettings['search_weight_' . $weight_factor];
+        $weight_total += $weight[$weight_factor];
+    }
+
+    // With no weights configured fall back to an even split; this also keeps the
+    // generated SQL from dividing by zero.
+    if ($weight_total === 0)
+    {
+        $weight = array(
+            'age' => 25,
+            'length' => 25,
+            'first_message' => 25,
+            'sticky' => 25,
+        );
+        $weight_total = 100;
+    }
+
+    // The delta source below now uses the same empty() test for the charset as the
+    // base source, so an empty charset no longer emits a dangling "SET NAMES ".
+    echo '#
+# Sphinx configuration file (sphinx.conf), configured for SMF 1.1
+#
+# By default the location of this file would probably be:
+# /usr/local/etc/sphinx.conf
+
+source smf_source
+{
+ type = mysql
+ sql_host = ', $db_server, '
+ sql_user = ', $db_user, '
+ sql_pass = ', $db_passwd, '
+ sql_db = ', $db_name, '
+ sql_port = 3306', empty($db_character_set) ? '' : '
+ sql_query_pre = SET NAMES ' . $db_character_set, '
+ sql_query_pre = \
+ REPLACE INTO ', $db_prefix, 'settings (variable, value) \
+ SELECT \'sphinx_indexed_msg_until\', MAX(id_msg) \
+ FROM ', $db_prefix, 'messages
+ sql_query_range = \
+ SELECT 1, value \
+ FROM ', $db_prefix, 'settings \
+ WHERE variable = \'sphinx_indexed_msg_until\'
+ sql_range_step = 1000
+ sql_query = \
+ SELECT \
+ m.id_msg, m.id_topic, m.id_board, IF(m.id_member = 0, 4294967295, m.id_member) AS id_member, m.poster_time, m.body, m.subject, \
+ t.num_replies + 1 AS num_replies, CEILING(1000000 * ( \
+ IF(m.id_msg < 0.7 * s.value, 0, (m.id_msg - 0.7 * s.value) / (0.3 * s.value)) * ' . $weight['age'] . ' + \
+ IF(t.num_replies < ' . $humungousTopicPosts . ', t.num_replies / ' . $humungousTopicPosts . ', 1) * ' . $weight['length'] . ' + \
+ IF(m.id_msg = t.id_first_msg, 1, 0) * ' . $weight['first_message'] . ' + \
+ IF(t.is_sticky = 0, 0, 1) * ' . $weight['sticky'] . ' \
+ ) / ' . $weight_total . ') AS relevance \
+ FROM ', $db_prefix, 'messages AS m, ', $db_prefix, 'topics AS t, ', $db_prefix, 'settings AS s \
+ WHERE t.id_topic = m.id_topic \
+ AND s.variable = \'maxMsgID\' \
+ AND m.id_msg BETWEEN $start AND $end
+ sql_attr_uint = id_topic
+ sql_attr_uint = id_board
+ sql_attr_uint = id_member
+ sql_attr_timestamp = poster_time
+ sql_attr_timestamp = relevance
+ sql_attr_timestamp = num_replies
+ sql_query_info = \
+ SELECT * \
+ FROM ', $db_prefix, 'messages \
+ WHERE id_msg = $id
+}
+
+source smf_delta_source : smf_source
+{
+ sql_query_pre = ', empty($db_character_set) ? '' : 'SET NAMES ' . $db_character_set, '
+ sql_query_range = \
+ SELECT s1.value, s2.value \
+ FROM ', $db_prefix, 'settings AS s1, ', $db_prefix, 'settings AS s2 \
+ WHERE s1.variable = \'sphinx_indexed_msg_until\' \
+ AND s2.variable = \'maxMsgID\'
+}
+
+index smf_base_index
+{
+ html_strip = 1
+ source = smf_source
+ path = ', $modSettings['sphinx_data_path'], '/smf_sphinx_base.index', empty($modSettings['sphinx_stopword_path']) ? '' : '
+ stopwords = ' . $modSettings['sphinx_stopword_path'], '
+ min_word_len = 2
+ charset_type = ', isset($db_character_set) && $db_character_set === 'utf8' ? 'utf-8' : 'sbcs', '
+ charset_table = 0..9, A..Z->a..z, _, a..z
+}
+
+index smf_delta_index : smf_base_index
+{
+ source = smf_delta_source
+ path = ', $modSettings['sphinx_data_path'], '/smf_sphinx_delta.index
+}
+
+index smf_index
+{
+ type = distributed
+ local = smf_base_index
+ local = smf_delta_index
+}
+
+indexer
+{
+ mem_limit = ', (int) $modSettings['sphinx_indexer_mem'], 'M
+}
+
+searchd
+{
+ listen = ', (int) $modSettings['sphinx_searchd_port'], '
+ listen = ', (int) $modSettings['sphinxql_searchd_port'], ':mysql41
+ log = ', $modSettings['sphinx_log_path'], '/searchd.log
+ query_log = ', $modSettings['sphinx_log_path'], '/query.log
+ read_timeout = 5
+ max_children = 30
+ pid_file = ', $modSettings['sphinx_data_path'], '/searchd.pid
+ max_matches = 1000
+}
+';
+
+    flush();
+}
+
+
+
+
+
+function template_sphinx_config_above($title)
+{
+ global $smfsite, $settings;
+
+ echo '
+
+
+ SMF Sphinx Configuration Utility
+
+
+
+
+