mirror of
https://github.com/moparisthebest/wallabag
synced 2024-11-23 17:42:15 -05:00
Merge pull request #707 from mariroz/dev
update to 3.2 version of full-text-rss, issue #694
This commit is contained in:
commit
87f01ea2e9
58
inc/3rdparty/config.php
vendored
58
inc/3rdparty/config.php
vendored
@ -47,6 +47,60 @@ $options->default_entries = 5;
|
||||
// 10, only 10 will be processed.
|
||||
$options->max_entries = 10;
|
||||
|
||||
// Full content
|
||||
// ----------------------
|
||||
// By default Full-Text RSS includes the extracted content in the output.
|
||||
// You can exclude this from the output by passing '&content=0' in the querystring.
|
||||
//
|
||||
// Possible values...
|
||||
// Always include: true
|
||||
// Never include: false
|
||||
// Include unless user overrides (&content=0): 'user' (default)
|
||||
//
|
||||
// Note: currently this does not disable full content extraction. It simply omits it
|
||||
// from the output.
|
||||
$options->content = 'user';
|
||||
|
||||
// Excerpts
|
||||
// ----------------------
|
||||
// By default Full-Text RSS does not include excerpts in the output.
|
||||
// You can enable this by passing '&summary=1' in the querystring.
|
||||
// This will include a plain text excerpt from the extracted content.
|
||||
//
|
||||
// Possible values...
|
||||
// Always include: true (recommended for new users)
|
||||
// Never include: false
|
||||
// Don't include unless user overrides (&summary=1): 'user' (default)
|
||||
//
|
||||
// Important: if both content and excerpts are requested, the excerpt will be
|
||||
// placed in the description element and the full content inside content:encoded.
|
||||
// If excerpts are not requested, the full content will go inside the description element.
|
||||
//
|
||||
// Why are we not returning both excerpts and content by default?
|
||||
// Mainly for backward compatibility.
|
||||
// Excerpts should appear in the feed item's description element. Previous versions
|
||||
// of Full-Text RSS did not return excerpts, so the description element was always
|
||||
// used for the full content (as recommended by the RSS advisory). When returning both,
|
||||
// we need somewhere else to place the content (content:encoded).
|
||||
// Having both enabled should not create any problems for news readers, but it may create
|
||||
// problems for developers upgrading from one of our earlier versions who may now find
|
||||
// their applications are returning excerpts instead of the full content they were
|
||||
// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
|
||||
// excerpts must be explicitly requested in the querystring by default.
|
||||
//
|
||||
// Why not use a different element name for excerpts?
|
||||
// According to the RSS advisory:
|
||||
// "Publishers who employ summaries should store the summary in description and
|
||||
// the full content in content:encoded, ordering description first within the item.
|
||||
// On items with no summary, the full content should be stored in description."
|
||||
// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
|
||||
//
|
||||
// For more consistent element naming, we recommend new users set this option to true.
|
||||
// The full content can still be excluded via the querystring, but the element names
|
||||
// will not change: when $options->summary = true, the description element will always
|
||||
// be reserved for the excerpt and content:encoded always for full content.
|
||||
$options->summary = 'user';
|
||||
|
||||
// Rewrite relative URLs
|
||||
// ----------------------
|
||||
// With this enabled relative URLs found in the extracted content
|
||||
@ -149,7 +203,7 @@ $options->registration_key = '';
|
||||
// If overriding with an environment variable, separate username and password with a colon, e.g.:
|
||||
// ftr_admin_credentials: admin:my-secret-password
|
||||
// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
|
||||
$options->admin_credentials = array('username'=>'admin', 'password'=>'admin');
|
||||
$options->admin_credentials = array('username'=>'admin', 'password'=>'');
|
||||
|
||||
// URLs to allow
|
||||
// ----------------------
|
||||
@ -375,7 +429,7 @@ $options->cache_cleanup = 100;
|
||||
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
|
||||
/////////////////////////////////////////////////
|
||||
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1');
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
|
||||
|
||||
if (basename(__FILE__) == 'config.php') {
|
||||
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
|
||||
|
@ -230,7 +230,7 @@ class ContentExtractor
|
||||
$this->debug("...XPath match: $pattern");
|
||||
// remove title from document
|
||||
try {
|
||||
$elems->item(0)->parentNode->removeChild($elems->item(0));
|
||||
@$elems->item(0)->parentNode->removeChild($elems->item(0));
|
||||
} catch (DOMException $e) {
|
||||
// do nothing
|
||||
}
|
||||
@ -725,4 +725,3 @@ class ContentExtractor
|
||||
return $this->nextPageUrl;
|
||||
}
|
||||
}
|
||||
?>
|
@ -5,10 +5,10 @@
|
||||
* Each instance of this class should hold extraction patterns and other directives
|
||||
* for a website. See ContentExtractor class to see how it's used.
|
||||
*
|
||||
* @version 0.7
|
||||
* @date 2012-08-27
|
||||
* @version 0.8
|
||||
* @date 2013-04-16
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2012 Keyvan Minoukadeh
|
||||
* @copyright 2013 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
@ -180,7 +180,7 @@ class SiteConfig
|
||||
|
||||
public function append(SiteConfig $newconfig) {
|
||||
// check for commands where we accept multiple statements (no test_url)
|
||||
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) {
|
||||
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
|
||||
// append array elements for this config variable from $newconfig to this config
|
||||
//$this->$var = $this->$var + $newconfig->$var;
|
||||
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
|
||||
@ -190,6 +190,12 @@ class SiteConfig
|
||||
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
|
||||
if ($this->$var === null) $this->$var = $newconfig->$var;
|
||||
}
|
||||
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
|
||||
foreach (array('find_string', 'replace_string') as $var) {
|
||||
// append array elements for this config variable from $newconfig to this config
|
||||
//$this->$var = $this->$var + $newconfig->$var;
|
||||
$this->$var = array_merge($this->$var, $newconfig->$var);
|
||||
}
|
||||
}
|
||||
|
||||
// returns SiteConfig instance if an appropriate one is found, false otherwise
|
||||
@ -335,4 +341,3 @@ class SiteConfig
|
||||
return $config;
|
||||
}
|
||||
}
|
||||
?>
|
10
inc/3rdparty/libraries/feedwriter/FeedItem.php
vendored
Normal file → Executable file
10
inc/3rdparty/libraries/feedwriter/FeedItem.php
vendored
Normal file → Executable file
@ -101,7 +101,8 @@
|
||||
*/
|
||||
public function setDescription($description)
|
||||
{
|
||||
$this->setElement('description', $description);
|
||||
$tag = ($this->version == ATOM)? 'summary' : 'description';
|
||||
$this->setElement($tag, $description);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -129,7 +130,12 @@
|
||||
$date = strtotime($date);
|
||||
}
|
||||
|
||||
if($this->version == RSS2)
|
||||
if($this->version == ATOM)
|
||||
{
|
||||
$tag = 'updated';
|
||||
$value = date(DATE_ATOM, $date);
|
||||
}
|
||||
elseif($this->version == RSS2)
|
||||
{
|
||||
$tag = 'pubDate';
|
||||
$value = date(DATE_RSS, $date);
|
||||
|
17
inc/3rdparty/libraries/feedwriter/FeedWriter.php
vendored
17
inc/3rdparty/libraries/feedwriter/FeedWriter.php
vendored
@ -97,15 +97,12 @@ define('JSONP', 3, true);
|
||||
header('X-content-type-options: nosniff');
|
||||
} elseif ($this->version == JSON) {
|
||||
header('Content-type: application/json; charset=UTF-8');
|
||||
$this->json = new stdClass();
|
||||
} elseif ($this->version == JSONP) {
|
||||
header('Content-type: application/javascript; charset=UTF-8');
|
||||
}
|
||||
}
|
||||
|
||||
if ($this->version == JSON || $this->version == JSONP) {
|
||||
$this->json = new stdClass();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
$this->printHead();
|
||||
$this->printChannels();
|
||||
@ -116,6 +113,11 @@ define('JSONP', 3, true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Hand back this writer's item collection.
 *
 * The method is declared with a reference return, so a caller that
 * assigns the result with =& operates on the writer's own $items
 * property rather than on a copy.
 *
 * @return mixed whatever is currently stored in $this->items
 */
public function &getItems()
{
    $collection =& $this->items;

    return $collection;
}
|
||||
|
||||
/**
|
||||
* Create a new FeedItem.
|
||||
*
|
||||
@ -199,7 +201,8 @@ define('JSONP', 3, true);
|
||||
*/
|
||||
public function setDescription($description)
{
    // When the feed version is ATOM the channel description is stored under
    // the 'subtitle' element; every other version uses 'description'.
    $tag = ($this->version == ATOM) ? 'subtitle' : 'description';

    // Forward the method argument itself. The variable name was previously
    // misspelled here, which passed an undefined (null) value instead of
    // the caller's description.
    $this->setChannelElement($tag, $description);
}
|
||||
|
||||
/**
|
||||
@ -244,7 +247,7 @@ define('JSONP', 3, true);
|
||||
{
|
||||
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
|
||||
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
|
||||
$out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
|
||||
$out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
|
||||
echo $out;
|
||||
}
|
||||
elseif ($this->version == JSON || $this->version == JSONP)
|
||||
|
13
inc/3rdparty/libraries/html5/TreeBuilder.php
vendored
13
inc/3rdparty/libraries/html5/TreeBuilder.php
vendored
@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
|
||||
|
||||
// Namespaces for foreign content
|
||||
const NS_HTML = null; // to prevent DOM from requiring NS on everything
|
||||
const NS_XHTML = 'http://www.w3.org/1999/xhtml';
|
||||
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
|
||||
const NS_SVG = 'http://www.w3.org/2000/svg';
|
||||
const NS_XLINK = 'http://www.w3.org/1999/xlink';
|
||||
@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
|
||||
}
|
||||
|
||||
private function insertElement($token, $append = true) {
|
||||
$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
|
||||
//$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
|
||||
$namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
|
||||
$el = $this->dom->createElementNS($namespaceURI, $token['name']);
|
||||
|
||||
if (!empty($token['attr'])) {
|
||||
foreach($token['attr'] as $attr) {
|
||||
if(!$el->hasAttribute($attr['name'])) {
|
||||
|
||||
// mike@macgirvin.com 2011-11-17, check attribute name for
|
||||
// validity (ignoring extenders and combiners) as illegal chars in names
|
||||
// causes everything to abort
|
||||
|
||||
$valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
|
||||
if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
|
||||
$el->setAttribute($attr['name'], $attr['value']);
|
||||
}
|
||||
}
|
||||
|
@ -401,4 +401,3 @@ class CookieJar
|
||||
return false;
|
||||
}
|
||||
}
|
||||
?>
|
@ -7,11 +7,11 @@
|
||||
* For environments which do not have these options, it reverts to standard sequential
|
||||
* requests (using file_get_contents())
|
||||
*
|
||||
* @version 1.1
|
||||
* @date 2012-08-20
|
||||
* @version 1.4
|
||||
* @date 2013-05-10
|
||||
* @see http://php.net/HttpRequestPool
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2011-2012 Keyvan Minoukadeh
|
||||
* @copyright 2011-2013 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
@ -22,7 +22,7 @@ class HumbleHttpAgent
|
||||
const METHOD_FILE_GET_CONTENTS = 4;
|
||||
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
|
||||
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
|
||||
const UA_PHP = 'PHP/5.2';
|
||||
const UA_PHP = 'PHP/5.4';
|
||||
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
|
||||
|
||||
protected $requests = array();
|
||||
@ -82,6 +82,8 @@ class HumbleHttpAgent
|
||||
// set request options (redirect must be 0)
|
||||
$this->requestOptions = array(
|
||||
'timeout' => 15,
|
||||
'connecttimeout' => 15,
|
||||
'dns_cache_timeout' => 300,
|
||||
'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
|
||||
// TODO: test onprogress?
|
||||
);
|
||||
@ -155,6 +157,37 @@ class HumbleHttpAgent
|
||||
return $iri->get_iri();
|
||||
}
|
||||
|
||||
/**
 * Look for a redirect target advertised inside an HTML document.
 *
 * The meta-refresh check (getMetaRefreshURL) is tried first; only when it
 * yields a falsy result is getUglyURL consulted, and its result is then
 * returned as-is.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML (or a leading portion of it) to inspect
 * @return mixed result of the first check that produced a truthy value,
 *               otherwise whatever getUglyURL returned
 */
public function getRedirectURLfromHTML($url, $html) {
    $metaRefresh = $this->getMetaRefreshURL($url, $html);
    if ($metaRefresh) {
        return $metaRefresh;
    }

    return $this->getUglyURL($url, $html);
}
|
||||
|
||||
/**
 * Extract the target of a <meta http-equiv="refresh"> tag from HTML.
 *
 * Example of a tag this is meant to catch:
 * <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
 *
 * The pattern is case-insensitive, expects http-equiv to come directly
 * before content, and accepts exactly one digit as the delay before the
 * semicolon.
 *
 * @param string $url  URL the HTML was fetched from (base for relative targets)
 * @param string $html HTML to search
 * @return mixed the redirect URL (made absolute against $url when it does
 *               not start with http:// or https://), or false when no tag
 *               matches or the target cannot be absolutized
 */
public function getMetaRefreshURL($url, $html) {
    if ($html == '') return false;

    $pattern = '!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i';
    $hit = preg_match($pattern, $html, $captures);
    if (!$hit) {
        return false;
    }

    $target = $captures[1];

    if (!preg_match('!^https?://!i', $target)) {
        // relative target: resolve it against the URL the page came from
        $base = new SimplePie_IRI($url);
        // collapse repeated slashes in the base path (they stop URLs resolving properly)
        if (isset($base->path)) {
            $base->path = preg_replace('!//+!', '/', $base->path);
        }
        $target = SimplePie_IRI::absolutize($base, $target);
        if (!$target) {
            return false;
        }
    }

    $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$target);
    return $target;
}
|
||||
|
||||
public function getUglyURL($url, $html) {
|
||||
if ($html == '') return false;
|
||||
$found = false;
|
||||
@ -173,7 +206,9 @@ class HumbleHttpAgent
|
||||
}
|
||||
$query['_escaped_fragment_'] = '';
|
||||
$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
|
||||
return $iri->get_iri();
|
||||
$ugly_url = $iri->get_iri();
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
|
||||
return $ugly_url;
|
||||
}
|
||||
|
||||
public function removeFragment($url) {
|
||||
@ -339,9 +374,8 @@ class HumbleHttpAgent
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
@ -464,9 +498,8 @@ class HumbleHttpAgent
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
@ -551,9 +584,8 @@ class HumbleHttpAgent
|
||||
// for AJAX sites, e.g. Blogger with its dynamic views templates.
|
||||
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
|
||||
if (isset($this->requests[$orig]['body'])) {
|
||||
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
|
||||
if ($redirectURL) {
|
||||
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
}
|
||||
}
|
||||
@ -776,4 +808,3 @@ if (!function_exists('gzdecode')) {
|
||||
return $data;
|
||||
}
|
||||
}
|
||||
?>
|
@ -76,4 +76,3 @@ class SimplePie_HumbleHttpAgent extends SimplePie_File
|
||||
}
|
||||
}
|
||||
}
|
||||
?>
|
File diff suppressed because it is too large
Load Diff
57
inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
vendored
Normal file
57
inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
vendored
Normal file
@ -0,0 +1,57 @@
|
||||
<?php
|
||||
/**
 * Exception type for Text_LanguageDetect.
 *
 * The class adds no behaviour of its own; it only defines the numeric
 * codes below, grouped by tens according to the kind of failure.
 */
class Text_LanguageDetect_Exception extends Exception
{
    // --- 1x: language database problems ---

    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file exists but cannot be read. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database content is not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    // --- 2x: invalid arguments ---

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A parameter contains an invalid character. */
    const INVALID_CHAR = 21;

    // --- 3x: language lookup ---

    /** The requested language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    // --- 4x: block detection ---

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    // --- 5x: clustering ---

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
|
339
inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
vendored
Normal file
339
inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
vendored
Normal file
@ -0,0 +1,339 @@
|
||||
<?php
|
||||
/**
|
||||
* Part of Text_LanguageDetect
|
||||
*
|
||||
* PHP version 5
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Christian Weiske <cweiske@php.net>
|
||||
* @copyright 2011 Christian Weiske <cweiske@php.net>
|
||||
* @license http://www.debian.org/misc/bsd.license BSD
|
||||
* @version SVN: $Id$
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
|
||||
/**
|
||||
* Provides a mapping between the languages from lang.dat and the
|
||||
* ISO 639-1 and ISO-639-2 codes.
|
||||
*
|
||||
* Note that this class contains only languages that exist in lang.dat.
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Christian Weiske <cweiske@php.net>
|
||||
* @copyright 2011 Christian Weiske <cweiske@php.net>
|
||||
* @license http://www.debian.org/misc/bsd.license BSD
|
||||
* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
*/
|
||||
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Keys mirror the values of $nameToCode3 so that a name -> code -> name
     * round trip succeeds for every language.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // 'ron' is the code $nameToCode3 uses for romanian; the former key
        // 'rom' did not match it, so the reverse lookup of 'ron' failed.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is also false for entries whose value is null, which is
        // how languages without a 2-letter code end up returning NULL.
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe") or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish", or NULL if
     *                     the code is unknown
     */
    public static function code2ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The lookup is case-insensitive.
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish", or NULL if
     *                     the code is unknown
     */
    public static function code3ToName($code)
    {
        // Look up the lower-cased code; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
|
@ -8,7 +8,7 @@
|
||||
* @author Nicholas Pisarro
|
||||
* @copyright 2006
|
||||
* @license BSD
|
||||
* @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
|
||||
* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
* @link http://langdetect.blogspot.com/
|
||||
*/
|
||||
@ -28,7 +28,7 @@
|
||||
* @author Nicholas Pisarro
|
||||
* @copyright 2006
|
||||
* @license BSD
|
||||
* @version release: 0.2.3
|
||||
* @version release: 0.3.0
|
||||
*/
|
||||
class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
{
|
||||
@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
* @access private
|
||||
* @param string $string string to be parsed
|
||||
*/
|
||||
function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) {
|
||||
if (isset($db)) $this->_db_filename = $db;
|
||||
if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
|
||||
function Text_LanguageDetect_Parser($string) {
|
||||
$this->_string = $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a string is suitable for parsing
|
||||
*
|
||||
* @static
|
||||
* @access public
|
||||
* @param string $str input string to test
|
||||
* @return bool true if acceptable, false if not
|
||||
*/
|
||||
function validateString($str) {
|
||||
public static function validateString($str) {
|
||||
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
|
||||
return true;
|
||||
} else {
|
||||
@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
|
||||
// unicode startup
|
||||
if ($this->_compile_unicode) {
|
||||
$blocks =& $this->_read_unicode_block_db();
|
||||
|
||||
$blocks = $this->_read_unicode_block_db();
|
||||
$block_count = count($blocks);
|
||||
|
||||
$skipped_count = 0;
|
||||
@ -350,5 +345,3 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
}
|
||||
|
||||
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
|
||||
|
||||
?>
|
@ -1059,8 +1059,8 @@ class Readability
|
||||
} else if ( $input > floor($p/3) ) {
|
||||
$this->dbg(' too many <input> elements');
|
||||
$toRemove = true;
|
||||
} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
|
||||
$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
|
||||
} else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
|
||||
$this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images');
|
||||
$toRemove = true;
|
||||
} else if($weight < 25 && $linkDensity > 0.2) {
|
||||
$this->dbg(' weight smaller than 25 and link density above 0.2');
|
||||
|
111
inc/3rdparty/makefulltextfeed.php
vendored
111
inc/3rdparty/makefulltextfeed.php
vendored
@ -3,8 +3,8 @@
|
||||
// Author: Keyvan Minoukadeh
|
||||
// Copyright (c) 2013 Keyvan Minoukadeh
|
||||
// License: AGPLv3
|
||||
// Version: 3.1
|
||||
// Date: 2013-03-05
|
||||
// Version: 3.2
|
||||
// Date: 2013-05-13
|
||||
// More info: http://fivefilters.org/content-only/
|
||||
// Help: http://help.fivefilters.org
|
||||
|
||||
@ -25,12 +25,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Usage
|
||||
// -----
|
||||
// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org
|
||||
// The following options can be passed in the querystring:
|
||||
// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
|
||||
// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
|
||||
// * API key: key=[api key] (optional, refer to config.php)
|
||||
// * Max entries to process: max=[max number of items] (optional)
|
||||
// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
|
||||
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
|
||||
|
||||
error_reporting(E_ALL ^ E_NOTICE);
|
||||
ini_set("display_errors", 1);
|
||||
@ -165,6 +161,8 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
|
||||
if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
|
||||
if (isset($_GET['xss'])) $redirect .= '&xss';
|
||||
if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
|
||||
if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
|
||||
if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
|
||||
if (isset($_GET['debug'])) $redirect .= '&debug';
|
||||
if ($debug_mode) {
|
||||
debug('Redirecting to hide access key, follow URL below to continue');
|
||||
@ -250,6 +248,28 @@ if ($options->favour_feed_titles == 'user') {
|
||||
$favour_feed_titles = $options->favour_feed_titles;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Include full content in output?
|
||||
///////////////////////////////////////////////
|
||||
if ($options->content === 'user') {
|
||||
if (isset($_GET['content']) && $_GET['content'] === '0') {
|
||||
$options->content = false;
|
||||
} else {
|
||||
$options->content = true;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Include summaries in output?
|
||||
///////////////////////////////////////////////
|
||||
if ($options->summary === 'user') {
|
||||
if (isset($_GET['summary']) && $_GET['summary'] === '1') {
|
||||
$options->summary = true;
|
||||
} else {
|
||||
$options->summary = false;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Exclude items if extraction fails
|
||||
///////////////////////////////////////////////
|
||||
@ -272,15 +292,6 @@ if ($options->detect_language === 'user') {
|
||||
$detect_language = $options->detect_language;
|
||||
}
|
||||
|
||||
if ($detect_language >= 2) {
|
||||
$language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
|
||||
'cebuano' => 'ceb', // ISO 639-2
|
||||
'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
|
||||
'hawaiian' => 'haw', // ISO 639-2
|
||||
'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
|
||||
'pidgin' => 'cpe', // ISO 639-2
|
||||
'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
|
||||
}
|
||||
$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
|
||||
|
||||
/////////////////////////////////////
|
||||
@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
|
||||
//////////////////////////////////
|
||||
if ($options->caching) {
|
||||
debug('Caching is enabled...');
|
||||
$cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
|
||||
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
|
||||
$check_cache = true;
|
||||
if ($options->apc && $options->smart_cache) {
|
||||
apc_add("cache.$cache_id", 0, 10*60);
|
||||
@ -550,14 +561,33 @@ foreach ($items as $key => $item) {
|
||||
$is_single_page = false;
|
||||
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
|
||||
$is_single_page = true;
|
||||
$effective_url = $single_page_response['effective_url'];
|
||||
// check if action defined for returned Content-Type
|
||||
$mime_info = get_mime_action_info($single_page_response['headers']);
|
||||
if (isset($mime_info['action'])) {
|
||||
if ($mime_info['action'] == 'exclude') {
|
||||
continue; // skip this feed item entry
|
||||
} elseif ($mime_info['action'] == 'link') {
|
||||
if ($mime_info['type'] == 'image') {
|
||||
$html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
|
||||
} else {
|
||||
$html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
|
||||
}
|
||||
$extracted_title = $mime_info['name'];
|
||||
$do_content_extraction = false;
|
||||
}
|
||||
}
|
||||
if ($do_content_extraction) {
|
||||
$html = $single_page_response['body'];
|
||||
// remove strange things
|
||||
$html = str_replace('</[>', '', $html);
|
||||
$html = convert_to_utf8($html, $single_page_response['headers']);
|
||||
$effective_url = $single_page_response['effective_url'];
|
||||
debug("Retrieved single-page view from $effective_url");
|
||||
}
|
||||
unset($single_page_response);
|
||||
}
|
||||
}
|
||||
if ($do_content_extraction) {
|
||||
debug('--------');
|
||||
debug('Attempting to extract content');
|
||||
$extract_result = $extractor->process($html, $effective_url);
|
||||
@ -567,7 +597,7 @@ foreach ($items as $key => $item) {
|
||||
// Deal with multi-page articles
|
||||
//die('Next: '.$extractor->getNextPageUrl());
|
||||
$is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
|
||||
if ($options->multipage && $is_multi_page) {
|
||||
if ($options->multipage && $is_multi_page && $options->content) {
|
||||
debug('--------');
|
||||
debug('Attempting to process multi-page article');
|
||||
$multi_page_urls = array();
|
||||
@ -605,13 +635,15 @@ foreach ($items as $key => $item) {
|
||||
// did we successfully deal with this multi-page article?
|
||||
if (empty($multi_page_content)) {
|
||||
debug('Failed to extract all parts of multi-page article, so not going to include them');
|
||||
$multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
|
||||
$_page = $readability->dom->createElement('p');
|
||||
$_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
|
||||
$multi_page_content[] = $_page;
|
||||
}
|
||||
foreach ($multi_page_content as $_page) {
|
||||
$_page = $content_block->ownerDocument->importNode($_page, true);
|
||||
$content_block->appendChild($_page);
|
||||
}
|
||||
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url);
|
||||
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
|
||||
}
|
||||
}
|
||||
// use extracted title for both feed and item title if we're using single-item dummy feed
|
||||
@ -658,7 +690,7 @@ foreach ($items as $key => $item) {
|
||||
} else {
|
||||
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
|
||||
}
|
||||
unset($content_block);
|
||||
//unset($content_block);
|
||||
// post-processing cleanup
|
||||
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
|
||||
if ($links == 'remove') {
|
||||
@ -681,7 +713,32 @@ foreach ($items as $key => $item) {
|
||||
debug('Filtering HTML to remove XSS');
|
||||
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
|
||||
}
|
||||
$newitem->setDescription($html);
|
||||
|
||||
// add content
|
||||
if ($options->summary === true) {
|
||||
// get summary
|
||||
$summary = '';
|
||||
if (!$do_content_extraction) {
|
||||
$summary = $html;
|
||||
} else {
|
||||
// Try to get first few paragraphs
|
||||
if (isset($content_block) && ($content_block instanceof DOMElement)) {
|
||||
$_paras = $content_block->getElementsByTagName('p');
|
||||
foreach ($_paras as $_para) {
|
||||
$summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
|
||||
if (strlen($summary) > 200) break;
|
||||
}
|
||||
} else {
|
||||
$summary = $html;
|
||||
}
|
||||
}
|
||||
unset($_paras, $_para);
|
||||
$summary = get_excerpt($summary);
|
||||
$newitem->setDescription($summary);
|
||||
if ($options->content) $newitem->setElement('content:encoded', $html);
|
||||
} else {
|
||||
if ($options->content) $newitem->setDescription($html);
|
||||
}
|
||||
|
||||
// set date
|
||||
if ((int)$item->get_date('U') > 0) {
|
||||
@ -727,11 +784,12 @@ foreach ($items as $key => $item) {
|
||||
//die('what');
|
||||
// Use PEAR's Text_LanguageDetect
|
||||
if (!isset($l)) {
|
||||
$l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');
|
||||
$l = new Text_LanguageDetect();
|
||||
$l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
|
||||
}
|
||||
$l_result = $l->detect($text_sample, 1);
|
||||
if (count($l_result) > 0) {
|
||||
$language = $language_codes[key($l_result)];
|
||||
$language = key($l_result);
|
||||
}
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
@ -794,7 +852,6 @@ foreach ($items as $key => $item) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/* } */
|
||||
$output->addItem($newitem);
|
||||
unset($html);
|
||||
$item_count++;
|
||||
|
40
inc/3rdparty/makefulltextfeedHelpers.php
vendored
40
inc/3rdparty/makefulltextfeedHelpers.php
vendored
@ -66,6 +66,38 @@ class DummySingleItem {
|
||||
// HELPER FUNCTIONS
|
||||
///////////////////////////////
|
||||
|
||||
// Adapted from WordPress
|
||||
// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
|
||||
/**
 * Build a plain-text excerpt from a piece of text that may contain HTML.
 *
 * Tags are stripped, the remaining text is split on runs of ASCII whitespace
 * (space, tab, CR, LF) and at most $num_words words are kept, joined by single
 * spaces. $more is appended only when words were actually cut off.
 *
 * @param string      $text      Text or HTML to build the excerpt from.
 * @param int         $num_words Maximum number of words to keep.
 * @param string|null $more      Suffix appended when the text was truncated;
 *                               null selects the default '…'.
 * @return string Excerpt with leading/trailing Unicode separators and control
 *                characters removed. Always a string, even for invalid UTF-8 input.
 */
function get_excerpt($text, $num_words=55, $more=null) {
	if (null === $more) $more = '…';
	$text = strip_tags($text);
	//TODO: Check if word count is based on single characters (East Asian characters)
	$sep = ' ';
	// Ask for one element more than needed: a surplus element means there was
	// more text than $num_words words.
	$words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
	$truncated = count($words_array) > $num_words;
	if ($truncated) {
		// the surplus element holds the whole unsplit remainder - drop it
		array_pop($words_array);
	}
	$text = implode($sep, $words_array);
	if ($truncated) $text = $text.$more;
	// trim whitespace at beginning or end of string
	// See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
	$trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
	if (null === $trimmed) {
		// preg_replace() returns null on error, which is what happens when the
		// subject is not valid UTF-8 under the /u modifier. Fall back to a
		// byte-wise trim so callers still receive a string.
		$trimmed = trim($text);
	}
	return $trimmed;
}
|
||||
|
||||
function url_allowed($url) {
|
||||
global $options;
|
||||
if (!empty($options->allowed_urls)) {
|
||||
@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null)
|
||||
if (strtolower($encoding) != 'utf-8') {
|
||||
debug('Converting to UTF-8');
|
||||
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
|
||||
/*
|
||||
if (function_exists('iconv')) {
|
||||
// iconv appears to handle certain character encodings better than mb_convert_encoding
|
||||
$html = iconv($encoding, 'utf-8', $html);
|
||||
} else {
|
||||
$html = mb_convert_encoding($html, 'utf-8', $encoding);
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
12
inc/3rdparty/site_config/custom/dailymotion.com.txt
vendored
Executable file
12
inc/3rdparty/site_config/custom/dailymotion.com.txt
vendored
Executable file
@ -0,0 +1,12 @@
|
||||
title: //title
|
||||
body: //iframe
|
||||
|
||||
replace_string(<![CDATA[): _
|
||||
replace_string(]]>): _
|
||||
|
||||
single_page_link: //link[@type='application/xml+oembed']
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://www.dailymotion.com/video/x1vk5oh_before-they-were-on-game-of-thrones_people
|
3
inc/3rdparty/site_config/custom/index.php
vendored
Normal file
3
inc/3rdparty/site_config/custom/index.php
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
<?php
|
||||
// this is here to prevent directory listing over the web
|
||||
?>
|
11
inc/3rdparty/site_config/custom/ted.com.txt
vendored
Executable file
11
inc/3rdparty/site_config/custom/ted.com.txt
vendored
Executable file
@ -0,0 +1,11 @@
|
||||
title: //title
|
||||
body: //div[@class='talk-article__body talk-transcript__body'] | //div[@class='media__image media__image--thumb talk-link__image']
|
||||
|
||||
strip_id_or_class: talk-transcript__para__time
|
||||
|
||||
single_page_link: //a[@id='hero-transcript-link']
|
||||
|
||||
#prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://www.ted.com/talks/andrew_solomon_how_the_worst_moments_in_our_lives_make_us_who_we_are
|
1
inc/3rdparty/site_config/index.php
vendored
1
inc/3rdparty/site_config/index.php
vendored
@ -1,3 +1,2 @@
|
||||
<?php
|
||||
// this is here to prevent directory listing over the web
|
||||
?>
|
@ -1 +1 @@
|
||||
4
|
||||
2013-05-12T22:53:07Z
|
@ -1145,7 +1145,8 @@ class Poche
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Cache.SerializerPath', CACHE);
|
||||
$config->set('HTML.SafeIframe', true);
|
||||
$config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/)%'); //allow YouTube and Vimeo$purifier = new HTMLPurifier($config);
|
||||
//allow YouTube, Vimeo and dailymotion videos
|
||||
$config->set('URI.SafeIframeRegexp', '%^(https?:)?//(www\.youtube(?:-nocookie)?\.com/embed/|player\.vimeo\.com/video/|www\.dailymotion\.com/embed/video/)%');
|
||||
|
||||
return new HTMLPurifier($config);
|
||||
}
|
||||
|
14
themes/default/_search-form.twig
Normal file → Executable file
14
themes/default/_search-form.twig
Normal file → Executable file
@ -7,17 +7,3 @@
|
||||
</p>
|
||||
</form>
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
|
||||
$("#search-form").hide();
|
||||
|
||||
$("#search").click(function(){
|
||||
$("#search-form").toggle();
|
||||
$("#search").toggleClass("current");
|
||||
$("#search-arrow").toggleClass("arrow-down");
|
||||
});
|
||||
|
||||
|
||||
});
|
||||
</script>
|
@ -394,6 +394,23 @@ a#bagit-form-close {
|
||||
.add-to-wallabag-link-after:visited {
|
||||
color: #999;
|
||||
}
|
||||
/* Default state: the link is taken out of the normal flow and is fully
   transparent and hidden. The 2s ease-out timing declared here governs the
   transition back into this state. */
a.add-to-wallabag-link-after {
    position: absolute;
    visibility: hidden;
    opacity: 0;
    transition-timing-function: ease-out;
    transition-duration: 2s;
}

/* Revealed while the link itself is hovered, or while the anchor immediately
   preceding it inside "#article article" is hovered; appears with a quick
   .3s ease-in transition. */
a.add-to-wallabag-link-after:hover,
#article article a:hover + a.add-to-wallabag-link-after {
    visibility: visible;
    opacity: 1;
    transition-timing-function: ease-in;
    transition-duration: .3s;
}

/* Generated content: the letter "w" is rendered after the link. */
a.add-to-wallabag-link-after:after {
    content: "w";
}
|
||||
|
||||
|
||||
#add-link-result {
|
||||
display: inline;
|
||||
|
Loading…
Reference in New Issue
Block a user