1
0
mirror of https://github.com/moparisthebest/wallabag synced 2024-11-23 09:32:15 -05:00

update to 3.2 version of full-text-rss, issue #694

This commit is contained in:
Maryana Rozhankivska 2014-05-22 17:16:38 +03:00
parent ab157bbb75
commit 3ec62cf95a
15 changed files with 4417 additions and 4174 deletions

View File

@ -47,6 +47,60 @@ $options->default_entries = 5;
// 10, only 10 will be processed.
$options->max_entries = 10;
// Full content
// ----------------------
// By default Full-Text RSS includes the extracted content in the output.
// You can exclude this from the output by passing '&content=0' in the querystring.
//
// Possible values...
// Always include: true
// Never include: false
// Include unless user overrides (&content=0): 'user' (default)
//
// Note: currently this does not disable full content extraction. It simply omits it
// from the output.
$options->content = 'user';
// Excerpts
// ----------------------
// By default Full-Text RSS does not include excerpts in the output.
// You can enable this by passing '&summary=1' in the querystring.
// This will include a plain text excerpt from the extracted content.
//
// Possible values...
// Always include: true (recommended for new users)
// Never include: false
// Don't include unless user overrides (&summary=1): 'user' (default)
//
// Important: if both content and excerpts are requested, the excerpt will be
// placed in the description element and the full content inside content:encoded.
// If excerpts are not requested, the full content will go inside the description element.
//
// Why are we not returning both excerpts and content by default?
// Mainly for backward compatibility.
// Excerpts should appear in the feed item's description element. Previous versions
// of Full-Text RSS did not return excerpts, so the description element was always
// used for the full content (as recommended by the RSS advisory). When returning both,
// we need somewhere else to place the content (content:encoded).
// Having both enabled should not create any problems for news readers, but it may create
// problems for developers upgrading from one of our earlier versions who may now find
// their applications are returning excerpts instead of the full content they were
// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
// excerpts must be explicitly requested in the querystring by default.
//
// Why not use a different element name for excerpts?
// According to the RSS advisory:
// "Publishers who employ summaries should store the summary in description and
// the full content in content:encoded, ordering description first within the item.
// On items with no summary, the full content should be stored in description."
// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
//
// For more consistent element naming, we recommend new users set this option to true.
// The full content can still be excluded via the querystring, but the element names
// will not change: when $options->summary = true, the description element will always
// be reserved for the excerpt and content:encoded always for full content.
$options->summary = 'user';
// Rewrite relative URLs
// ----------------------
// With this enabled relative URLs found in the extracted content
@ -149,7 +203,7 @@ $options->registration_key = '';
// If overriding with an environment variable, separate username and password with a colon, e.g.:
// ftr_admin_credentials: admin:my-secret-password
// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
$options->admin_credentials = array('username'=>'admin', 'password'=>'admin');
$options->admin_credentials = array('username'=>'admin', 'password'=>'');
// URLs to allow
// ----------------------
@ -375,7 +429,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -230,7 +230,7 @@ class ContentExtractor
$this->debug("...XPath match: $pattern");
// remove title from document
try {
$elems->item(0)->parentNode->removeChild($elems->item(0));
@$elems->item(0)->parentNode->removeChild($elems->item(0));
} catch (DOMException $e) {
// do nothing
}
@ -725,4 +725,3 @@ class ContentExtractor
return $this->nextPageUrl;
}
}
?>

View File

@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used.
*
* @version 0.7
* @date 2012-08-27
* @version 0.8
* @date 2013-04-16
* @author Keyvan Minoukadeh
* @copyright 2012 Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -180,7 +180,7 @@ class SiteConfig
public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) {
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
@ -190,6 +190,12 @@ class SiteConfig
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
if ($this->$var === null) $this->$var = $newconfig->$var;
}
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
foreach (array('find_string', 'replace_string') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_merge($this->$var, $newconfig->$var);
}
}
// returns SiteConfig instance if an appropriate one is found, false otherwise
@ -335,4 +341,3 @@ class SiteConfig
return $config;
}
}
?>

18
inc/3rdparty/libraries/feedwriter/FeedItem.php vendored Normal file → Executable file
View File

@ -101,7 +101,8 @@
*/
public function setDescription($description)
{
$this->setElement('description', $description);
$tag = ($this->version == ATOM)? 'summary' : 'description';
$this->setElement($tag, $description);
}
/**
@ -129,15 +130,20 @@
$date = strtotime($date);
}
if($this->version == RSS2)
if($this->version == ATOM)
{
$tag = 'pubDate';
$value = date(DATE_RSS, $date);
$tag = 'updated';
$value = date(DATE_ATOM, $date);
}
elseif($this->version == RSS2)
{
$tag = 'pubDate';
$value = date(DATE_RSS, $date);
}
else
{
$tag = 'dc:date';
$value = date("Y-m-d", $date);
$tag = 'dc:date';
$value = date("Y-m-d", $date);
}
$this->setElement($tag, $value);

View File

@ -97,16 +97,13 @@ define('JSONP', 3, true);
header('X-content-type-options: nosniff');
} elseif ($this->version == JSON) {
header('Content-type: application/json; charset=UTF-8');
$this->json = new stdClass();
} elseif ($this->version == JSONP) {
header('Content-type: application/javascript; charset=UTF-8');
$this->json = new stdClass();
}
}
if ($this->version == JSON || $this->version == JSONP) {
$this->json = new stdClass();
}
$this->printHead();
$this->printChannels();
$this->printItems();
@ -116,6 +113,11 @@ define('JSONP', 3, true);
}
}
/**
 * Return the items held by this writer.
 *
 * The method is declared with `&`, so it hands back a reference to
 * $this->items rather than a copy. A caller only keeps that reference
 * if it also binds the result with `=&`.
 *
 * @return mixed reference to $this->items
 */
public function &getItems()
{
return $this->items;
}
/**
* Create a new FeedItem.
*
@ -199,7 +201,8 @@ define('JSONP', 3, true);
*/
public function setDescription($description)
{
    // Atom output uses 'subtitle' as the element name; every other version uses 'description'.
    $tag = ($this->version == ATOM) ? 'subtitle' : 'description';
    // Pass the $description argument through. The earlier code referenced a
    // misspelled, undefined variable, so the element was always set to null.
    $this->setChannelElement($tag, $description);
}
/**
@ -244,7 +247,7 @@ define('JSONP', 3, true);
{
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
echo $out;
}
elseif ($this->version == JSON || $this->version == JSONP)

View File

@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
// Namespaces for foreign content
const NS_HTML = null; // to prevent DOM from requiring NS on everything
const NS_XHTML = 'http://www.w3.org/1999/xhtml';
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NS_SVG = 'http://www.w3.org/2000/svg';
const NS_XLINK = 'http://www.w3.org/1999/xlink';
@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
}
private function insertElement($token, $append = true) {
$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
//$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
$namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
$el = $this->dom->createElementNS($namespaceURI, $token['name']);
if (!empty($token['attr'])) {
foreach($token['attr'] as $attr) {
if(!$el->hasAttribute($attr['name'])) {
// mike@macgirvin.com 2011-11-17, check attribute name for
// validity (ignoring extenders and combiners) as illegal chars in names
// causes everything to abort
$valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
$el->setAttribute($attr['name'], $attr['value']);
}
}

View File

@ -401,4 +401,3 @@ class CookieJar
return false;
}
}
?>

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.1
* @date 2012-08-20
* @version 1.4
* @date 2013-05-10
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011-2012 Keyvan Minoukadeh
* @copyright 2011-2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -22,7 +22,7 @@ class HumbleHttpAgent
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.2';
const UA_PHP = 'PHP/5.4';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
@ -82,6 +82,8 @@ class HumbleHttpAgent
// set request options (redirect must be 0)
$this->requestOptions = array(
'timeout' => 15,
'connecttimeout' => 15,
'dns_cache_timeout' => 300,
'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
// TODO: test onprogress?
);
@ -155,6 +157,37 @@ class HumbleHttpAgent
return $iri->get_iri();
}
/**
 * Find a redirect target announced inside an HTML document.
 *
 * A meta refresh redirect takes precedence; when none is found the
 * result of getUglyURL() is returned instead.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML (or its leading portion) to inspect
 * @return mixed the first truthy result of getMetaRefreshURL(), otherwise whatever getUglyURL() returns
 */
public function getRedirectURLfromHTML($url, $html) {
    $meta_refresh = $this->getMetaRefreshURL($url, $html);
    if ($meta_refresh) {
        return $meta_refresh;
    }
    // no meta refresh found - fall back to the second detector
    return $this->getUglyURL($url, $html);
}
/**
 * Look for an HTML meta refresh redirect and return its target URL.
 *
 * Only refreshes with a single-digit delay are matched. The target is read
 * from an HTML attribute, so entities are decoded and surrounding whitespace
 * is removed before the URL is used.
 *
 * @param string $url  URL the HTML was fetched from (base for relative targets)
 * @param string $html HTML (or its leading portion) to scan
 * @return mixed the target string when it is already absolute, otherwise the
 *               truthy result of SimplePie_IRI::absolutize(); false when no
 *               usable redirect is found
 */
public function getMetaRefreshURL($url, $html) {
    if ($html == '') return false;
    // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
    // The trailing \s*/? also accepts XHTML-style self-closing tags (... " />).
    if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*\s*/?>!i', $html, $match)) {
        return false;
    }
    // Attribute values are HTML-escaped (e.g. &amp; for &): decode before using as a URL
    $redirect_url = trim(html_entity_decode($match[1], ENT_QUOTES, 'UTF-8'));
    if ($redirect_url === '') return false;
    if (preg_match('!^https?://!i', $redirect_url)) {
        // already absolute
        $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
        return $redirect_url;
    }
    // absolutize redirect URL
    $base = new SimplePie_IRI($url);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
    if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
        $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
        return $absolute;
    }
    return false;
}
public function getUglyURL($url, $html) {
if ($html == '') return false;
$found = false;
@ -173,7 +206,9 @@ class HumbleHttpAgent
}
$query['_escaped_fragment_'] = '';
$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
return $iri->get_iri();
$ugly_url = $iri->get_iri();
$this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
return $ugly_url;
}
public function removeFragment($url) {
@ -339,9 +374,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
}
}
@ -464,9 +498,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
}
}
@ -551,9 +584,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
}
}
@ -776,4 +808,3 @@ if (!function_exists('gzdecode')) {
return $data;
}
}
?>

View File

@ -76,4 +76,3 @@ class SimplePie_HumbleHttpAgent extends SimplePie_File
}
}
}
?>

File diff suppressed because it is too large Load Diff

View File

@ -1059,8 +1059,8 @@ class Readability
} else if ( $input > floor($p/3) ) {
$this->dbg(' too many <input> elements');
$toRemove = true;
} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
} else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
$this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true;
} else if($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight smaller than 25 and link density above 0.2');

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.1
// Date: 2013-03-05
// Version: 3.2
// Date: 2013-05-13
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -25,12 +25,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
// Usage
// -----
// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org
// The following options can be passed in the querystring:
// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
// * API key: key=[api key] (optional, refer to config.php)
// * Max entries to process: max=[max number of items] (optional)
// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
error_reporting(E_ALL ^ E_NOTICE);
ini_set("display_errors", 1);
@ -165,6 +161,8 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
if (isset($_GET['xss'])) $redirect .= '&xss';
if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $redirect .= '&debug';
if ($debug_mode) {
debug('Redirecting to hide access key, follow URL below to continue');
@ -250,6 +248,28 @@ if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = $options->favour_feed_titles;
}
///////////////////////////////////////////////
// Include full content in output?
///////////////////////////////////////////////
// 'user' lets the querystring decide: content stays enabled unless
// the request explicitly passes content=0.
if ($options->content === 'user') {
    $content_disabled = isset($_GET['content']) && $_GET['content'] === '0';
    $options->content = !$content_disabled;
}
///////////////////////////////////////////////
// Include summaries in output?
///////////////////////////////////////////////
// 'user' lets the querystring decide: summaries stay disabled unless
// the request explicitly passes summary=1.
if ($options->summary === 'user') {
    $summary_requested = isset($_GET['summary']) && $_GET['summary'] === '1';
    $options->summary = $summary_requested;
}
///////////////////////////////////////////////
// Exclude items if extraction fails
///////////////////////////////////////////////
@ -272,15 +292,6 @@ if ($options->detect_language === 'user') {
$detect_language = $options->detect_language;
}
if ($detect_language >= 2) {
$language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
'cebuano' => 'ceb', // ISO 639-2
'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
'hawaiian' => 'haw', // ISO 639-2
'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
'pidgin' => 'cpe', // ISO 639-2
'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
}
$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
/////////////////////////////////////
@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
//////////////////////////////////
if ($options->caching) {
debug('Caching is enabled...');
$cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
$check_cache = true;
if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, 10*60);
@ -550,14 +561,33 @@ foreach ($items as $key => $item) {
$is_single_page = false;
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
$is_single_page = true;
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
$effective_url = $single_page_response['effective_url'];
debug("Retrieved single-page view from $effective_url");
// check if action defined for returned Content-Type
$mime_info = get_mime_action_info($single_page_response['headers']);
if (isset($mime_info['action'])) {
if ($mime_info['action'] == 'exclude') {
continue; // skip this feed item entry
} elseif ($mime_info['action'] == 'link') {
if ($mime_info['type'] == 'image') {
$html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
} else {
$html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
}
$extracted_title = $mime_info['name'];
$do_content_extraction = false;
}
}
if ($do_content_extraction) {
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
debug("Retrieved single-page view from $effective_url");
}
unset($single_page_response);
}
}
if ($do_content_extraction) {
debug('--------');
debug('Attempting to extract content');
$extract_result = $extractor->process($html, $effective_url);
@ -567,7 +597,7 @@ foreach ($items as $key => $item) {
// Deal with multi-page articles
//die('Next: '.$extractor->getNextPageUrl());
$is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
if ($options->multipage && $is_multi_page) {
if ($options->multipage && $is_multi_page && $options->content) {
debug('--------');
debug('Attempting to process multi-page article');
$multi_page_urls = array();
@ -605,13 +635,15 @@ foreach ($items as $key => $item) {
// did we successfully deal with this multi-page article?
if (empty($multi_page_content)) {
debug('Failed to extract all parts of multi-page article, so not going to include them');
$multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
$_page = $readability->dom->createElement('p');
$_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
$multi_page_content[] = $_page;
}
foreach ($multi_page_content as $_page) {
$_page = $content_block->ownerDocument->importNode($_page, true);
$content_block->appendChild($_page);
}
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url);
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
}
}
// use extracted title for both feed and item title if we're using single-item dummy feed
@ -658,7 +690,7 @@ foreach ($items as $key => $item) {
} else {
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
}
unset($content_block);
//unset($content_block);
// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($links == 'remove') {
@ -671,130 +703,155 @@ foreach ($items as $key => $item) {
}
}
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
// filter xss?
if ($xss_filter) {
debug('Filtering HTML to remove XSS');
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
}
// add content
if ($options->summary === true) {
// get summary
$summary = '';
if (!$do_content_extraction) {
$summary = $html;
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
// filter xss?
if ($xss_filter) {
debug('Filtering HTML to remove XSS');
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
}
$newitem->setDescription($html);
// set date
if ((int)$item->get_date('U') > 0) {
$newitem->setDate((int)$item->get_date('U'));
} elseif ($extractor->getDate()) {
$newitem->setDate($extractor->getDate());
}
// add authors
if ($authors = $item->get_authors()) {
foreach ($authors as $author) {
// for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
if ($author->get_name() !== null) {
$newitem->addElement('dc:creator', $author->get_name());
} elseif ($author->get_email() !== null) {
$newitem->addElement('dc:creator', $author->get_email());
// Try to get first few paragraphs
if (isset($content_block) && ($content_block instanceof DOMElement)) {
$_paras = $content_block->getElementsByTagName('p');
foreach ($_paras as $_para) {
$summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
if (strlen($summary) > 200) break;
}
}
} elseif ($authors = $extractor->getAuthors()) {
//TODO: make sure the list size is reasonable
foreach ($authors as $author) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem->addElement('dc:creator', $author);
break;
} else {
$summary = $html;
}
}
unset($_paras, $_para);
$summary = get_excerpt($summary);
$newitem->setDescription($summary);
if ($options->content) $newitem->setElement('content:encoded', $html);
} else {
if ($options->content) $newitem->setDescription($html);
}
// add language
if ($detect_language) {
$language = $extractor->getLanguage();
if (!$language) $language = $feed->get_language();
if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
try {
if ($use_cld) {
// Use PHP-CLD extension
$php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
$res = $php_cld($text_sample);
if (is_array($res) && count($res) > 0) {
$language = $res[0]['code'];
}
} else {
//die('what');
// Use PEAR's Text_LanguageDetect
if (!isset($l)) {
$l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');
}
$l_result = $l->detect($text_sample, 1);
if (count($l_result) > 0) {
$language = $language_codes[key($l_result)];
}
// set date
if ((int)$item->get_date('U') > 0) {
$newitem->setDate((int)$item->get_date('U'));
} elseif ($extractor->getDate()) {
$newitem->setDate($extractor->getDate());
}
// add authors
if ($authors = $item->get_authors()) {
foreach ($authors as $author) {
// for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
if ($author->get_name() !== null) {
$newitem->addElement('dc:creator', $author->get_name());
} elseif ($author->get_email() !== null) {
$newitem->addElement('dc:creator', $author->get_email());
}
}
} elseif ($authors = $extractor->getAuthors()) {
//TODO: make sure the list size is reasonable
foreach ($authors as $author) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem->addElement('dc:creator', $author);
break;
}
}
// add language
if ($detect_language) {
$language = $extractor->getLanguage();
if (!$language) $language = $feed->get_language();
if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
try {
if ($use_cld) {
// Use PHP-CLD extension
$php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
$res = $php_cld($text_sample);
if (is_array($res) && count($res) > 0) {
$language = $res[0]['code'];
}
} catch (Exception $e) {
//die('error: '.$e);
// do nothing
}
}
if ($language && (strlen($language) < 7)) {
$newitem->addElement('dc:language', $language);
}
}
// add MIME type (if it appeared in our exclusions lists)
if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
// add effective URL (URL after redirects)
if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-<2D>-25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
// add categories
if ($categories = $item->get_categories()) {
foreach ($categories as $category) {
if ($category->get_label() !== null) {
$newitem->addElement('category', $category->get_label());
}
}
}
// check for enclosures
if ($options->keep_enclosures) {
if ($enclosures = $item->get_enclosures()) {
foreach ($enclosures as $enclosure) {
// thumbnails
foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
$newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
} else {
//die('what');
// Use PEAR's Text_LanguageDetect
if (!isset($l)) {
$l = new Text_LanguageDetect();
$l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
}
$l_result = $l->detect($text_sample, 1);
if (count($l_result) > 0) {
$language = key($l_result);
}
if (!$enclosure->get_link()) continue;
$enc = array();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc['url'] = $enclosure->get_link();
if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
$newitem->addElement('media:content', '', $enc);
}
} catch (Exception $e) {
//die('error: '.$e);
// do nothing
}
}
/* } */
if ($language && (strlen($language) < 7)) {
$newitem->addElement('dc:language', $language);
}
}
// add MIME type (if it appeared in our exclusions lists)
if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
// add effective URL (URL after redirects)
if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-<2D>-25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
// add categories
if ($categories = $item->get_categories()) {
foreach ($categories as $category) {
if ($category->get_label() !== null) {
$newitem->addElement('category', $category->get_label());
}
}
}
// check for enclosures
if ($options->keep_enclosures) {
if ($enclosures = $item->get_enclosures()) {
foreach ($enclosures as $enclosure) {
// thumbnails
foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
$newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
}
if (!$enclosure->get_link()) continue;
$enc = array();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc['url'] = $enclosure->get_link();
if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
$newitem->addElement('media:content', '', $enc);
}
}
}
$output->addItem($newitem);
unset($html);
$item_count++;

View File

@ -66,6 +66,38 @@ class DummySingleItem {
// HELPER FUNCTIONS
///////////////////////////////
/**
 * Build a plain-text excerpt limited to a number of words.
 *
 * Adapted from WordPress
 * http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
 *
 * @param string      $text      text to shorten; HTML tags are stripped first
 * @param int         $num_words maximum number of whitespace-separated words to keep
 * @param string|null $more      suffix appended when the text was cut (null means '&hellip;')
 * @return string the excerpt, words joined by single spaces
 */
function get_excerpt($text, $num_words=55, $more=null) {
    if (null === $more) $more = '&hellip;';
    $text = strip_tags($text);
    //TODO: Check if word count is based on single characters (East Asian characters);
    // such text would need to be split per character (preg_match_all('/./u', ...))
    // and joined with an empty separator instead.
    // Ask for one chunk more than needed: its presence means the text is too long.
    $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
    $sep = ' ';
    if (count($words_array) > $num_words) {
        array_pop($words_array);
        $text = implode($sep, $words_array).$more;
    } else {
        $text = implode($sep, $words_array);
    }
    // trim whitespace at beginning or end of string
    // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
    $trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
    // With the /u modifier preg_replace() returns null if $text is not valid UTF-8.
    // Fall back to a byte-wise trim so the excerpt is not silently lost.
    if ($trimmed === null) {
        return trim($text);
    }
    return $trimmed;
}
function url_allowed($url) {
global $options;
if (!empty($options->allowed_urls)) {
@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null)
if (strtolower($encoding) != 'utf-8') {
debug('Converting to UTF-8');
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
/*
if (function_exists('iconv')) {
// iconv appears to handle certain character encodings better than mb_convert_encoding
$html = iconv($encoding, 'utf-8', $html);
} else {
$html = mb_convert_encoding($html, 'utf-8', $encoding);
}
*/
}
}
}

View File

@ -1,3 +1,2 @@
<?php
// this is here to prevent directory listing over the web
?>

View File

@ -1 +1 @@
4
2013-05-12T22:53:07Z