diff --git a/inc/JSLikeHTMLElement.php b/inc/JSLikeHTMLElement.php index dfcc1be..238ba8a 100644 --- a/inc/JSLikeHTMLElement.php +++ b/inc/JSLikeHTMLElement.php @@ -4,7 +4,7 @@ * * This class extends PHP's DOMElement to allow * users to get and set the innerHTML property of -* HTML elements in the same way it's done in +* HTML elements in the same way it's done in * JavaScript. * * Example usage: @@ -15,16 +15,16 @@ * $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); * $doc->loadHTML('
Para 1
Para 2
Para 1
Para 2
' * echo "\n\n"; -* +* * // set innerHTML * $elem->innerHTML = 'FiveFilters.org'; * echo $elem->innerHTML; // prints 'FiveFilters.org' * echo "\n\n"; -* +* * // print document (with our changes) * echo $doc->saveXML(); * @endcode @@ -59,7 +59,7 @@ class JSLikeHTMLElement extends DOMElement $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); // UsingSorry, Readability was unable to parse this page for content.
'; + $articleContent->innerHTML = 'Sorry, Readability was unable to parse this page for content.
'; } - + $overlay->setAttribute('id', 'readOverlay'); $innerDiv->setAttribute('id', 'readInner'); @@ -201,7 +201,7 @@ class Readability $innerDiv->appendChild($articleTitle); $innerDiv->appendChild($articleContent); $overlay->appendChild($innerDiv); - + /* Clear the old HTML, insert the new content. */ $this->body->innerHTML = ''; $this->body->appendChild($overlay); @@ -209,21 +209,21 @@ class Readability $this->body->removeAttribute('style'); $this->postProcessContent($articleContent); - + // Set title and content instance variables $this->articleTitle = $articleTitle; $this->articleContent = $articleContent; - + return $this->success; } - + /** * Debug */ protected function dbg($msg) { if ($this->debug) echo '* ',$msg, "\n"; } - + /** * Run any post-process modifications to article content as necessary. * @@ -231,11 +231,11 @@ class Readability * @return void */ public function postProcessContent($articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { + if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { $this->addFootnotes($articleContent); } } - + /** * Get the article title as an H1. * @@ -248,11 +248,11 @@ class Readability try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); } catch(Exception $e) {} - + if (preg_match('/ [\|\-] /', $curTitle)) { $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - + if (count(explode(' ', $curTitle)) < 3) { $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); } @@ -279,17 +279,17 @@ class Readability if (count(explode(' ', $curTitle)) <= 4) { $curTitle = $origTitle; } - + $articleTitle = $this->dom->createElement('h1'); $articleTitle->innerHTML = $curTitle; - + return $articleTitle; } - + /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. - * + * * @return void **/ protected function prepDocument() { @@ -328,13 +328,13 @@ class Readability $footnotesWrapper = $this->dom->createElement('div'); $footnotesWrapper->setAttribute('id', 'readability-footnotes'); $footnotesWrapper->innerHTML = ' tags, etc.
@@ -429,7 +429,7 @@ class Readability
* as a header and not a subheader, so remove it since we already have a header.
***/
if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
- $this->clean($articleContent, 'h2');
+ $this->clean($articleContent, 'h2');
}
$this->clean($articleContent, 'iframe');
@@ -448,7 +448,7 @@ class Readability
$embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
-
+
if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
{
$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
@@ -457,13 +457,13 @@ class Readability
try {
$articleContent->innerHTML = preg_replace('/
]*>\s*
innerHTML);
- //articleContent.innerHTML = articleContent.innerHTML.replace(/
]*>\s*
]*>\s*
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); } } - + /** * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. @@ -474,7 +474,7 @@ class Readability protected function initializeNode($node) { $readability = $this->dom->createAttribute('readability'); $readability->value = 0; // this is our contentScore - $node->setAttributeNode($readability); + $node->setAttributeNode($readability); switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case case 'DIV': @@ -486,7 +486,7 @@ class Readability case 'BLOCKQUOTE': $readability->value += 3; break; - + case 'ADDRESS': case 'OL': case 'UL': @@ -510,7 +510,7 @@ class Readability } $readability->value += $this->getClassWeight($node); } - + /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. @@ -548,7 +548,7 @@ class Readability $node->parentNode->removeChild($node); $nodeIndex--; continue; - } + } } if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { @@ -589,7 +589,7 @@ class Readability } } } - + /** * Loop through all paragraphs, and assign a score to them based on how content-y they look. * Then add their score to their parent node. @@ -613,7 +613,7 @@ class Readability } /* Initialize readability data for the parent. */ - if (!$parentNode->hasAttribute('readability')) + if (!$parentNode->hasAttribute('readability')) { $this->initializeNode($parentNode); $candidates[] = $parentNode; @@ -633,15 +633,15 @@ class Readability /* Add points for any commas within this paragraph */ $contentScore += count(explode(',', $innerText)); - + /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ $contentScore += min(floor(strlen($innerText) / 100), 3); - + /* Add the score to the parent. The grandparent gets half. */ $parentNode->getAttributeNode('readability')->value += $contentScore; if ($grandParentNode) { - $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; + $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; } } @@ -727,12 +727,12 @@ class Readability { $append = true; } - + if (strtoupper($siblingNode->nodeName) == 'P') { $linkDensity = $this->getLinkDensity($siblingNode); $nodeContent = $this->getInnerText($siblingNode); $nodeLength = strlen($nodeContent); - + if ($nodeLength > 80 && $linkDensity < 0.25) { $append = true; @@ -751,7 +751,7 @@ class Readability $sibNodeName = strtoupper($siblingNode->nodeName); if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - + $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); $nodeToAppend = $this->dom->createElement('div'); try { @@ -770,7 +770,7 @@ class Readability $s--; $sl--; } - + /* To ensure a node does not interfere with readability styles, remove its classnames */ $nodeToAppend->removeAttribute('class'); @@ -796,14 +796,14 @@ class Readability // in the meantime, we check and create an empty element if it's not there. if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); $this->body->innerHTML = $this->bodyCache; - + if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); return $this->grabArticle($this->body); } else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { $this->removeFlag(self::FLAG_WEIGHT_CLASSES); - return $this->grabArticle($this->body); + return $this->grabArticle($this->body); } else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); @@ -815,7 +815,7 @@ class Readability } return $articleContent; } - + /** * Remove script tags from document * @@ -829,7 +829,7 @@ class Readability $scripts->item($i)->parentNode->removeChild($scripts->item($i)); } } - + /** * Get the inner text of a node. * This also strips out any excess whitespace to be found. @@ -878,11 +878,11 @@ class Readability $elem->removeAttribute('style'); } } - + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. - * + * * @param DOMElement $e * @return number (float) */ @@ -900,9 +900,9 @@ class Readability return 0; } } - + /** - * Get an elements class/id weight. Uses regular expressions to tell if this + * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. * * @param DOMElement $e @@ -964,7 +964,7 @@ class Readability public function clean($e, $tag) { $targetList = $e->getElementsByTagName($tag); $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); - + for ($y=$targetList->length-1; $y >= 0; $y--) { /* Allow youtube and vimeo videos through as people usually want to see those. */ if ($isEmbed) { @@ -972,7 +972,7 @@ class Readability for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) } - + /* First, check the elements attributes to see if any of them contain youtube or vimeo */ if (preg_match($this->regexps['video'], $attributeValues)) { continue; @@ -986,10 +986,10 @@ class Readability $targetList->item($y)->parentNode->removeChild($targetList->item($y)); } } - + /** * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, + * "Fishy" is an algorithm based on content length, classnames, * link density, number of images & embeds, etc. * * @param DOMElement $e @@ -1013,7 +1013,7 @@ class Readability for ($i=$curTagsLength-1; $i >= 0; $i--) { $weight = $this->getClassWeight($tagsList->item($i)); $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; - + $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); if ($weight + $contentScore < 0) { @@ -1034,13 +1034,13 @@ class Readability $embeds = $tagsList->item($i)->getElementsByTagName('embed'); for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; + $embedCount++; } } $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; + $embedCount++; } } @@ -1058,7 +1058,7 @@ class Readability $toRemove = true; } else if ( $input > floor($p/3) ) { $this->dbg(' too many elements'); - $toRemove = true; + $toRemove = true; } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); $toRemove = true; @@ -1082,7 +1082,7 @@ class Readability $toRemove = true; } else if ( $input > floor($p/3) ) { $this->dbg(' too many elements'); - $toRemove = true; + $toRemove = true; } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); $toRemove = true; @@ -1126,11 +1126,11 @@ class Readability public function flagIsActive($flag) { return ($this->flags & $flag) > 0; } - + public function addFlag($flag) { $this->flags = $this->flags | $flag; } - + public function removeFlag($flag) { $this->flags = $this->flags & ~$flag; }