1
0
mirror of https://github.com/moparisthebest/wallabag synced 2024-11-23 09:32:15 -05:00

updated specific configuration for parsing

This commit is contained in:
Nicolas Lœuillet 2014-07-13 10:15:40 +02:00
parent 58dbe10388
commit 4e067ceabd
952 changed files with 7585 additions and 5682 deletions

8
inc/3rdparty/site_config/standard/24ways.org.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //div[@class='meta']/h2/a title: //div[@class='meta']/h2/a
author: //div[@class='meta']/h2/following-sibling::p/a/text() author: //div[@class='meta']/h2/following-sibling::p/a/text()
date://div[@class='meta']/h2/strong date://div[@class='meta']/h2/strong
body: //div[@id='article'] body: //div[@id='article']
strip: //div[@class='domore'] strip: //div[@class='domore']
test_url: http://24ways.org/2011/composing-the-new-canon test_url: http://24ways.org/2011/composing-the-new-canon

View File

@ -0,0 +1,8 @@
title: //h1[contains(@class, 'entry-title')]
date: //meta[@name='weibo: article:create_at']/@content
body: //div[contains(@class, 'mainContent')]
strip_id_or_class: related_topics
prune: no
test_url: http://www.36kr.com/p/207879.html

8
inc/3rdparty/site_config/standard/37signals.com.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //div[@class='post_header']//h2/a title: //div[@class='post_header']//h2/a
author: //span[@class='author'] author: //span[@class='author']
date: //span[@class='date'] date: //span[@class='date']
body: //div[@id='Content'] body: //div[@id='Content']
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department

16
inc/3rdparty/site_config/standard/3quarksdaily.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
body: //div[@class='content'] body: //div[@class='content']
date: //div[@class='content']/h2 date: //div[@class='content']/h2
strip: //div[@class='content']/h2 strip: //div[@class='content']/h2
title: //div[@class='content']/h3 title: //div[@class='content']/h3
strip: //div[@id='postmenu'] strip: //div[@id='postmenu']
strip: //div[@class='trackback'] strip: //div[@class='trackback']
tidy: no tidy: no
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html

0
inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt vendored Normal file → Executable file
View File

4
inc/3rdparty/site_config/standard/43folders.com.txt vendored Normal file → Executable file
View File

@ -1,4 +1,4 @@
body: //*[@class = 'content'] body: //*[@class = 'content']
author: //*[@class = 'submitted']/a author: //*[@class = 'submitted']/a
date: substring-after(//*[@class = 'submitted']/text(), '|') date: substring-after(//*[@class = 'submitted']/text(), '|')
test_url: http://www.43folders.com/2011/04/22/cranking test_url: http://www.43folders.com/2011/04/22/cranking

50
inc/3rdparty/site_config/standard/500px.com.txt vendored Normal file → Executable file
View File

@ -1,27 +1,27 @@
# very loose setup for both 500px.com/photo/* and 500px.com/blog/* # very loose setup for both 500px.com/photo/* and 500px.com/blog/*
# photo page example: http://500px.com/photo/4181666 # photo page example: http://500px.com/photo/4181666
# blog page example: http://500px.com/blog/110 # blog page example: http://500px.com/blog/110
# avoid "no text" error # avoid "no text" error
tidy:no tidy:no
prune:no prune:no
# reorganize photo page elements # reorganize photo page elements
#body://div[contains(@class,'container')] #body://div[contains(@class,'container')]
move_into(body)://div[contains(@id,'thephoto')] move_into(body)://div[contains(@id,'thephoto')]
move_into(body)://div[contains(@id,'description')] move_into(body)://div[contains(@id,'description')]
move_into(body)://div[contains(@id,'tags')] move_into(body)://div[contains(@id,'tags')]
move_into(body)://div[contains(@id,'photo-info')] move_into(body)://div[contains(@id,'photo-info')]
# clean photo page info # clean photo page info
strip://span[contains(@id,'copyright')] strip://span[contains(@id,'copyright')]
strip://*[contains(@id,'store')] strip://*[contains(@id,'store')]
strip://*[contains(@id,'user-info')] strip://*[contains(@id,'user-info')]
strip://*[contains(@id,'photo-stats')] strip://*[contains(@id,'photo-stats')]
strip://*[contains(@id,'voting_controls_container')] strip://*[contains(@id,'voting_controls_container')]
strip://*[contains(@id,'more-photos')] strip://*[contains(@id,'more-photos')]
strip://*[contains(@id,'embed-photo')] strip://*[contains(@id,'embed-photo')]
# clean blog page side bar # clean blog page side bar
strip://*[contains(@class,'col d3 clearafter')] strip://*[contains(@class,'col d3 clearafter')]
test_url: http://500px.com/photo/3641041?from=editors test_url: http://500px.com/photo/3641041?from=editors

0
inc/3rdparty/site_config/standard/512pixels.net.txt vendored Normal file → Executable file
View File

14
inc/3rdparty/site_config/standard/5by5.tv.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
body: //*[@id="episode"] body: //*[@id="episode"]
prune: no prune: no
tidy: no tidy: no
autodetect_next_page: no autodetect_next_page: no
strip_id_or_class: player strip_id_or_class: player
strip://*[@id="header"] strip://*[@id="header"]
test_url: http://5by5.tv/buildanalyze/60 test_url: http://5by5.tv/buildanalyze/60

View File

@ -0,0 +1,7 @@
title: //*[@id='sstitle']
body: //div[@id='sstory']
strip_id_or_class: newsoptions
prune: no
test_url: http://www.7newsbelize.com/sstory.php?nid=25654
test_url: http://www.7newsbelize.com/7news.xml

14
inc/3rdparty/site_config/standard/944.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
title: //h2[@class='border'] title: //h2[@class='border']
body: //div[@class='padding'] body: //div[@class='padding']
convert_double_br_tags: yes convert_double_br_tags: yes
strip: //div[@id='social_sharing'] strip: //div[@id='social_sharing']
strip: //div[@class='socialLinks'] strip: //div[@class='socialLinks']
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/ test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/

38
inc/3rdparty/site_config/standard/README.md vendored Executable file
View File

@ -0,0 +1,38 @@
Full-Text RSS site config files
================
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically.
This repository contains the site config files we use in Full-Text RSS.
### Contributing changes
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
### How to write a site config file
The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block.
For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
### Instapaper
When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required).
### Testing site config files
Currently you need a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.

18
inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt vendored Normal file → Executable file
View File

@ -1,10 +1,10 @@
title: //meta[@property='og:title']/@content title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1 strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton strip_id_or_class: zvaFacebookButton
tidy: no tidy: no
prune: no prune: no
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757 test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757

18
inc/3rdparty/site_config/standard/aachener-zeitung.de.txt vendored Normal file → Executable file
View File

@ -1,10 +1,10 @@
title: //meta[@property='og:title']/@content title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1 strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton strip_id_or_class: zvaFacebookButton
tidy: no tidy: no
prune: no prune: no
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718 test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718

10
inc/3rdparty/site_config/standard/abc.es.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
title: //meta[@property='og:title']/@content title: //meta[@property='og:title']/@content
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text'] body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody']
strip_id_or_class: colB strip_id_or_class: colB
prune: no prune: no
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html

26
inc/3rdparty/site_config/standard/abc.net.au.txt vendored Normal file → Executable file
View File

@ -1,10 +1,18 @@
title: //h1 title: //div[@class='article section']//h1
author: //div[@class="byline"]/a author: //div[@class="byline"]/a
date: //span[@class="timestamp"] date: //span[@class="timestamp"]
body: //div[@class="page section"]
strip: //p[@class="topics"]
strip: //h1 strip: //a[@class="inline-caption"]
strip: //div[@class="byline"] strip: //p[@class="ticker section noprint"]
strip: //p[@class="published"] strip: //p[@class="topics"]
strip: //h1
strip: //div[@class="byline"]
strip: //p[@class="published"]
strip: //div[contains(@class,"featured-scroller")] strip: //div[contains(@class,"featured-scroller")]
test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544 strip_id_or_class: footer
tidy: no
test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892
test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business

52
inc/3rdparty/site_config/standard/abcnews.go.com.txt vendored Normal file → Executable file
View File

@ -1,27 +1,27 @@
title: //h1[@class='headline'] title: //h1[@class='headline']
body: //div[@id='storyText'] body: //div[@id='storyText']
# for video entries # for video entries
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')] body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
author: //div[@class='byline'] author: //div[@class='byline']
date: //div[@class='date'] date: //div[@class='date']
strip: //*[@id='date_partner'] strip: //*[@id='date_partner']
strip: //div[@class='breadcrumb'] strip: //div[@class='breadcrumb']
strip: //div[contains(@class,'show_tools')] strip: //div[contains(@class,'show_tools')]
strip: //div[@id='sponsoredByAd'] strip: //div[@id='sponsoredByAd']
strip: //div[contains(@class,'rel_container')] strip: //div[contains(@class,'rel_container')]
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]] strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]] strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
strip: //p[contains(., 'Click here to return to')] strip: //p[contains(., 'Click here to return to')]
#strip_id_or_class: media #strip_id_or_class: media
strip_id_or_class: mediaplayer strip_id_or_class: mediaplayer
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
prune: no prune: no
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true') single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744 test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
# multi-page # multi-page
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544 test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544

16
inc/3rdparty/site_config/standard/accesstoinsight.org.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
title: //div[@id='H_docTitle'] title: //div[@id='H_docTitle']
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer'] body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
strip_id_or_class: F_toenail strip_id_or_class: F_toenail
prune: no prune: no
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html

4
inc/3rdparty/site_config/standard/acidcow.com.txt vendored Normal file → Executable file
View File

@ -1,3 +1,3 @@
body: //div[starts-with(@id, 'news-id-')] body: //div[starts-with(@id, 'news-id-')]
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html

14
inc/3rdparty/site_config/standard/acquia.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
title://h1[@class="title"] title://h1[@class="title"]
author://div[@class="submitted"]/span/a author://div[@class="submitted"]/span/a
date://div[@class="submitted"]/span date://div[@class="submitted"]/span
body://div[@class="content-wrapper"] body://div[@class="content-wrapper"]
strip://div[@id="skip-link"] strip://div[@id="skip-link"]
strip://div[@id="region-content-3-3"] strip://div[@id="region-content-3-3"]
strip://div[@id="section-footer"] strip://div[@id="section-footer"]
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code

6
inc/3rdparty/site_config/standard/acroswing.fr.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
tidy:no tidy:no
date: //time[@class='updated'] date: //time[@class='updated']
dissolve: //ul[@class='video-gallery']/li dissolve: //ul[@class='video-gallery']/li
dissolve: //ul[@class='video-gallery'] dissolve: //ul[@class='video-gallery']
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php

View File

@ -0,0 +1,5 @@
title: //h1[@class='articleTitle ']
body: //div[@class='bodyText widget storyContent']
strip: //p/span[@class='quote']/..
strip_id_or_class: 'pull1'
test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html

View File

@ -0,0 +1,13 @@
author: //article//address[contains(@class, 'author')]
body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')]
strip: //address//img
strip: //footer
strip_id_or_class: abSticky
prune: no
test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab
test_url: http://www.aftonbladet.se/debatt/article16207536.ab
test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab
test_url: http://www.aftonbladet.se/rss.xml

26
inc/3rdparty/site_config/standard/aht.seriouseats.com.txt vendored Normal file → Executable file
View File

@ -1,15 +1,15 @@
body: //div[@id='content'] body: //div[@id='content']
# clean up recipe pages # clean up recipe pages
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
#recipe pages #recipe pages
strip_id_or_class: "recipe-feedback" strip_id_or_class: "recipe-feedback"
strip_id_or_class: "comments" strip_id_or_class: "comments"
strip_id_or_class: "procedure-number" strip_id_or_class: "procedure-number"
strip_id_or_class: "more-with-author" strip_id_or_class: "more-with-author"
#slice #slice
strip_id_or_class: "inner" strip_id_or_class: "inner"
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html

View File

@ -0,0 +1,6 @@
body: //div[@id='main-column']//div[@class='content']
prune: no
test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645
test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout

0
inc/3rdparty/site_config/standard/alex.mullr.net.txt vendored Normal file → Executable file
View File

View File

@ -0,0 +1,4 @@
body: //section[@class='content']
date: //span[1]
author: //h1[@id='sitetitle']
test_url: https://alexduner.com/blog/2013/1/something-i-learned-today

View File

@ -0,0 +1,4 @@
body: //section[@class='content']
date: //span[1]
author: //h1[@id='sitetitle']
test_url: https://alexduner.squarespace.com/blog/2013/1/tech-culture-from-the-outside-looking-in

20
inc/3rdparty/site_config/standard/alistapart.com.txt vendored Normal file → Executable file
View File

@ -1,12 +1,12 @@
title: //h1[@class='title'] title: //h1[@class='title']
author: //h3[@class='byline']/a author: //h3[@class='byline']/a
date: //div[@class='ishinfo'] date: //div[@class='ishinfo']
body: //*[@id='articletext'] body: //*[@id='articletext']
strip_id_or_class: 'ishinfo' strip_id_or_class: 'ishinfo'
strip_id_or_class: 'metastuff' strip_id_or_class: 'metastuff'
strip_id_or_class: 'learnmore' strip_id_or_class: 'learnmore'
strip_id_or_class: 'discuss' strip_id_or_class: 'discuss'
prune: no prune: no
test_url: http://www.alistapart.com/articles/organizing-mobile/ test_url: http://www.alistapart.com/articles/organizing-mobile/

14
inc/3rdparty/site_config/standard/aljazeera.com.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
title: //span[@id='DetailedTitle'] title: //span[@id='DetailedTitle']
body: //td[@id='tdTextContent'] body: //td[@id='tdTextContent']
strip_id_or_class: Skyscrapper_Body strip_id_or_class: Skyscrapper_Body
date: //span[@id='ctl00_cphBody_lblDate'] date: //span[@id='ctl00_cphBody_lblDate']
author: //div[@id="dvAuthorInfo"]//a/text() author: //div[@id="dvAuthorInfo"]//a/text()
strip: //table[ tbody/tr/td/object ] strip: //table[ tbody/tr/td/object ]
prune: no prune: no
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html

24
inc/3rdparty/site_config/standard/allrecipes.com.txt vendored Normal file → Executable file
View File

@ -1,14 +1,14 @@
title: //h1[@id='itemTitle'] title: //h1[@id='itemTitle']
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')] body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right'] strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
strip: //div[contains(@class, 'rightcoltoolsdiv')] strip: //div[contains(@class, 'rightcoltoolsdiv')]
strip: //div[contains(@class, 'servings-form')] strip: //div[contains(@class, 'servings-form')]
strip: //p[@class='nutritional-information'] strip: //p[@class='nutritional-information']
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')] strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
strip: //div[@id='nutri-info']/div[contains(@class, 'title')] strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter'] strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
strip_id_or_class: eshaAttribute strip_id_or_class: eshaAttribute
strip_id_or_class: eshaParagraph strip_id_or_class: eshaParagraph
prune: no prune: no
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd

21
inc/3rdparty/site_config/standard/allthingsd.com.txt vendored Normal file → Executable file
View File

@ -1,10 +1,13 @@
title://div[@class="article-title"]/h1[@class="title"] title://div[@class="article-title"]/h1[@class="title"]
date: //p[@class="article-date"] date: //p[@class="article-date"]
body://*[@class="article-body article-text"] body://div[contains(@class, "article-body")]
# Trim out related posts at bottom of article # Trim out related posts at bottom of article
strip://blockquote[@class="memo"] strip://blockquote[@class="memo"]
# Yup, no idea why author won't work... tidy: no
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# Yup, no idea why author won't work...
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it. # [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/ test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/

12
inc/3rdparty/site_config/standard/allyou.com.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
title: //div[@id='pageHdr']//h1 title: //div[@id='pageHdr']//h1
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint'] body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
strip: //div[contains(@class, 'infoBox') or @id='infoBox'] strip: //div[contains(@class, 'infoBox') or @id='infoBox']
single_page_link: //li[@id='print']/a single_page_link: //li[@id='print']/a
prune: no prune: no
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/ test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/

18
inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt vendored Normal file → Executable file
View File

@ -1,11 +1,11 @@
body: //div[@class = 'entry'] body: //div[@class = 'entry']
date: substring-after(//p[@class="date"],'بتاريخ ') date: substring-after(//p[@class="date"],'بتاريخ ')
strip_id_or_class: date strip_id_or_class: date
strip_id_or_class: follow-single strip_id_or_class: follow-single
strip_id_or_class: ratingblock strip_id_or_class: ratingblock
strip_id_or_class: newRatingHolder strip_id_or_class: newRatingHolder
strip_id_or_class: postmetadata strip_id_or_class: postmetadata
strip_id_or_class: addthis_toolbox strip_id_or_class: addthis_toolbox
strip_id_or_class: addthis_default_style strip_id_or_class: addthis_default_style
strip_id_or_class: size-full strip_id_or_class: size-full
test_url: http://alphabeta.argaam.com/?p=35657 test_url: http://alphabeta.argaam.com/?p=35657

16
inc/3rdparty/site_config/standard/alriyadh.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
body: //div[@id = "article-view"] body: //div[@id = "article-view"]
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')] body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
author: //p[@class = "author"] author: //p[@class = "author"]
strip: //h1 strip: //h1
strip: //h2 strip: //h2
strip_id_or_class: author strip_id_or_class: author
prune: no prune: no
test_url: http://www.alriyadh.com/2011/10/10/article674357.html test_url: http://www.alriyadh.com/2011/10/10/article674357.html
test_url: http://www.alriyadh.com/net/article/780935 test_url: http://www.alriyadh.com/net/article/780935

0
inc/3rdparty/site_config/standard/alseraj.net.txt vendored Normal file → Executable file
View File

0
inc/3rdparty/site_config/standard/alt1040.com.txt vendored Normal file → Executable file
View File

View File

@ -0,0 +1,4 @@
single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')]
test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest
test_url: http://feeds.feedblitz.com/alternet

0
inc/3rdparty/site_config/standard/altfoto.com.txt vendored Normal file → Executable file
View File

16
inc/3rdparty/site_config/standard/alumni.stanford.edu.txt vendored Normal file → Executable file
View File

@ -1,10 +1,10 @@
title: //h1 title: //h1
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ") author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
date: //div/a[contains (@href, "issue")] date: //div/a[contains (@href, "issue")]
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1] move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
body: //div[@class="enableBullets"] body: //div[@class="enableBullets"]
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819 test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819

View File

@ -0,0 +1,6 @@
body: //div[@id='content']//div[contains(@class, 'content')]
strip_id_or_class: widget
strip: //a[contains(@href, 'upm_export=')]
test_url: http://amandala.com.bz/news/feed/
test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/

36
inc/3rdparty/site_config/standard/amazon.com.txt vendored Normal file → Executable file
View File

@ -1,19 +1,19 @@
title: //span[@id = 'btAsinTitle'] title: //span[@id = 'btAsinTitle']
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
#strip_id_or_class: quantityDropdownDiv #strip_id_or_class: quantityDropdownDiv
#strip_id_or_class: addToCartSpan #strip_id_or_class: addToCartSpan
#strip_id_or_class: oneClickDiv #strip_id_or_class: oneClickDiv
strip_id_or_class: nocontent strip_id_or_class: nocontent
strip_id_or_class: masDynamicConten strip_id_or_class: masDynamicConten
strip_id_or_class: dynamic-content strip_id_or_class: dynamic-content
prune: no prune: no
find_string: <span id="actualPriceValue"> find_string: <span id="actualPriceValue">
replace_string: <span id="actualPriceValue"><br />Price: replace_string: <span id="actualPriceValue"><br />Price:
strip_id_or_class: collapsePS strip_id_or_class: collapsePS
strip_id_or_class: expandPS strip_id_or_class: expandPS
strip_id_or_class: psPlaceHolde strip_id_or_class: psPlaceHolde
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')] strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/ test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/

8
inc/3rdparty/site_config/standard/americandrink.net.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //div[@class='head']/h2/a title: //div[@class='head']/h2/a
author: //div[@class='head']/a author: //div[@class='head']/a
date: //div[@class='head']/p[@class='date']/a date: //div[@class='head']/p[@class='date']/a
body: //div[@class='copy'] body: //div[@class='copy']
strip: //p[@class='meta'] strip: //p[@class='meta']
test_url: http://americandrink.net/post/10567188712/free-the-hooch test_url: http://americandrink.net/post/10567188712/free-the-hooch

18
inc/3rdparty/site_config/standard/americascup.com.txt vendored Normal file → Executable file
View File

@ -1,10 +1,10 @@
title: //div[@class="editorial-content"]/h3 title: //div[@class="editorial-content"]/h3
body: //div[@class="hero-image" or @class="editorial-content"] body: //div[@class="hero-image" or @class="editorial-content"]
strip: //ul[@class="hero-caption"] strip: //ul[@class="hero-caption"]
strip_id_or_class: footer strip_id_or_class: footer
prune: no prune: no
tidy: no tidy: no
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/ test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/

6
inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
title: //h1[@class="post-title"] title: //h1[@class="post-title"]
author: //span[@class="author"]/a author: //span[@class="author"]/a
date: //span[@class="date"] date: //span[@class="date"]
body: //div[@class="post-content main"] body: //div[@class="post-content main"]
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/ test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/

View File

@ -0,0 +1,8 @@
title: //title
body: //div[@class="entry-content"]
author: //span[@class="author vcard"]
date: //span[@class="entry-date"]
test_url: http://www.amptoons.com/blog/2013/03/14/open-thread-and-link-farm-i-hate-being-sick-edition/

20
inc/3rdparty/site_config/standard/anandtech.com.txt vendored Normal file → Executable file
View File

@ -1,11 +1,11 @@
author: //a[@class='b'][1] author: //a[@class='b'][1]
date: substring-after(substring-before(//div, 'Posted in'), ' on ') date: substring-after(substring-before(//div, 'Posted in'), ' on ')
strip_image_src: /content/images/globals/ strip_image_src: /content/images/globals/
strip: //h2[. = 'Page 1']/preceding::p strip: //h2[. = 'Page 1']/preceding::p
strip: //h2 strip: //h2
prune: no prune: no
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/

View File

@ -0,0 +1,5 @@
body: //div[@class='post_content']
date: //div[@class='date_day'] | //div[@class='date_month']
test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/

16
inc/3rdparty/site_config/standard/andyrutledge.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
title: //h2 title: //h2
author: string('Andy Rutledge') author: string('Andy Rutledge')
date: //div[@class='articledate'] date: //div[@class='articledate']
body: //div[@class='copybody'] body: //div[@class='copybody']
strip: //*[@class='space'] strip: //*[@class='space']
strip: //*[@class='articleFoot'] strip: //*[@class='articleFoot']
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php

14
inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
title: //h1[@class="title"] title: //h1[@class="title"]
author: ("Anna Manasova") author: ("Anna Manasova")
# is ignored, unfortunately # is ignored, unfortunately
date: //p[@class="date"] date: //p[@class="date"]
body: //div[@class="entry"] body: //div[@class="entry"]
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/ test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/

34
inc/3rdparty/site_config/standard/applature.com.txt vendored Normal file → Executable file
View File

@ -1,18 +1,18 @@
title: //h1[contains(@class, 'title')] title: //h1[contains(@class, 'title')]
body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer'] body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
date: //div[@class='date'] date: //div[@class='date']
strip_id_or_class: sharethis strip_id_or_class: sharethis
strip_id_or_class: stats strip_id_or_class: stats
strip_id_or_class: apply_form strip_id_or_class: apply_form
strip_id_or_class: job_map strip_id_or_class: job_map
strip_id_or_class: respond strip_id_or_class: respond
strip: //h1//span[@class='type'] strip: //h1//span[@class='type']
strip: //li[@class='print' or @class='map'] strip: //li[@class='print' or @class='map']
replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
prune: no prune: no
tidy: no tidy: no
test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/ test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/

12
inc/3rdparty/site_config/standard/apple.com.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
strip: //p[@class='sosumi'] strip: //p[@class='sosumi']
# Aren't they witty? # Aren't they witty?
# I can't work out what causes the  before the title. # I can't work out what causes the  before the title.
title: //h1[@class='title'] title: //h1[@class='title']
strip: //h1[@class='title'] strip: //h1[@class='title']
test_url: http://www.apple.com/pr/library/2011/02/15appstore.html test_url: http://www.apple.com/pr/library/2011/02/15appstore.html

View File

@ -0,0 +1,4 @@
body: //div[contains(@class, 'articulum')]
test_url: http://www.appledaily.com.tw/realtimenews/article/new/20140120/330479
test_url: http://www.appledaily.com.tw/rss/create/kind/rnews/type/new/

34
inc/3rdparty/site_config/standard/appleinsider.com.txt vendored Normal file → Executable file
View File

@ -1,11 +1,23 @@
title: //p[@class='title'] title: //h1[@class="art-head"]
author: //p[text() = 'By ']/a/text() author: //p[contains(@class, 'byline')]/a
strip: //p[text() = 'By '] #author: //p[text() = 'By ']/a/text()
#strip: //p[text() = 'By ']
body: //td[@class='bod']
strip_id_or_class: title date: //p[contains(@class, 'date-header')]
strip_id_or_class: minor
body: //div[@class="article"]
strip_id_or_class: multipagefooter strip_id_or_class: lazy
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html #strip_id_or_class: minor
strip_id_or_class: multipagefooter
strip_id_or_class: date-header
strip_id_or_class: byline
find_string: <noscript>
replace_string: <div>
find_string: </noscript>
replace_string: </div>
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
test_url: http://appleinsider.com/articles/13/10/03/goldee-companion-app-for-philips-hue-bulbs-offers-shifting-dynamic-light-scenes
test_url: http://appleinsider.com/appleinsider.rss

0
inc/3rdparty/site_config/standard/appleweblog.com.txt vendored Normal file → Executable file
View File

6
inc/3rdparty/site_config/standard/archdaily.com.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
date: //div[@class='post_date'] date: //div[@class='post_date']
body: //div[@class='post_content'] body: //div[@class='post_content']
test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up

38
inc/3rdparty/site_config/standard/archiveofourown.org.txt vendored Normal file → Executable file
View File

@ -1,18 +1,22 @@
# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages. # Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default. # Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.
# Exclude: header, footer, navigation, comments. # Exclude: header, footer, navigation, comments.
# Notes: User is a newbie with XPaths. # Notes: User is a newbie with XPaths.
title: //h2[@class='title'] title: //h2[@class='title']
author: //h3[@class='byline'] author: //h3[@class='byline']
author: //a[@class='login author'] author: //a[@class='login author']
strip_id_or_class:header strip_id_or_class:header
strip_id_or_class:navigation strip_id_or_class:navigation
strip_id_or_class:feedback strip_id_or_class:feedback
strip_id_or_class:kudos strip_id_or_class:kudos
strip_id_or_class:add_comment_placeholder strip_id_or_class:add_comment_placeholder
strip_id_or_class:add_comment strip_id_or_class:add_comment
strip_id_or_class:globalize strip_id_or_class:globalize
strip_id_or_class:footer strip_id_or_class:footer
test_url: http://archiveofourown.org/works/229402?view_full_work=true
single_page_link: //div[@id='main']//a[contains(@href, 'view_adult=true')]
test_url: http://archiveofourown.org/works/229402?view_full_work=true
test_url: http://archiveofourown.org/works/750111/chapters/1399929

33
inc/3rdparty/site_config/standard/arstechnica.com.txt vendored Normal file → Executable file
View File

@ -1,16 +1,17 @@
author: //p[@class='byline']/a author: //p[@class='byline']/a
body: //div[contains(@class,'article-content')] body: //div[contains(@class,'article-content')]
strip: //h2[@class='title'] strip: //h2[@class='title']
strip_id_or_class: byline strip_id_or_class: byline
prune: no strip_id_or_class: story-sidebar
prune: no
date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
date: //div[@class='byline']/span[@class='posted']//abbr date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
date: //div[@class='byline']/span[@class='posted']//abbr
title: //div[@id='story']//h2[@class='title']
title: //div[@id='story']//h2[@class='title']
strip: //div[@class='pager']
next_page_link: //nav//a[span/@class='next']/@href strip: //div[@class='pager']
next_page_link: //nav//a[span/@class='next']/@href
test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/

8
inc/3rdparty/site_config/standard/articles.boston.com.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1 title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ") author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"] date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
strip_id_or_class: mod-pagination strip_id_or_class: mod-pagination
test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park

18
inc/3rdparty/site_config/standard/articles.courant.com.txt vendored Normal file → Executable file
View File

@ -1,11 +1,11 @@
title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1 title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"] date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3] author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
strip_id_or_class: mod-article-byline strip_id_or_class: mod-article-byline
strip_id_or_class: mod-article-header strip_id_or_class: mod-article-header
strip_id_or_class: mod-article-subtitle strip_id_or_class: mod-article-subtitle
#This leaves some crud after the article, but it's better than nothing. #This leaves some crud after the article, but it's better than nothing.
#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element. #It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown

View File

@ -0,0 +1,11 @@
body: //div[contains(@class, "article_body")]
# print view
body: //div[@id='print_facet']//div[@id='body']
tidy: no
prune: no
single_page_link: concat(substring-before(//div[@id="echo_container_a"]/@guid, '_story.html'), '_print.html')
test_url: http://articles.washingtonpost.com/2011-10-22/world/35279694_1_germany-acts-german-leaders-chancellor-angela-merkel
test_url: http://articles.washingtonpost.com/2013-05-31/opinions/39658000_1_chemical-weapons-mass-destruction-cartels

2
inc/3rdparty/site_config/standard/asahi.com.txt vendored Normal file → Executable file
View File

@ -1,3 +1,3 @@
body: //div[@id='HeadLine'] body: //div[@id='HeadLine']
strip: //div[@id='utility_right'] strip: //div[@id='utility_right']
test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html

6
inc/3rdparty/site_config/standard/ascarter.net.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
title: //h1[@class='article_title'] title: //h1[@class='article_title']
author: //span[@class='author'] author: //span[@class='author']
date: //h2[@class='dateline'] date: //h2[@class='dateline']
body: //div[@class='article_body'] body: //div[@class='article_body']
test_url: http://ascarter.net/2012/02/20/enough-is-enough.html test_url: http://ascarter.net/2012/02/20/enough-is-enough.html

10
inc/3rdparty/site_config/standard/astronews.com.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
title: //span[@class='titel'] title: //span[@class='titel']
author: //span[@class='metadaten_C']/a//span[@class='metadaten_C'] author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
date: substring-after(//span[@class='metadaten_C'],'astronews.com') date: substring-after(//span[@class='metadaten_C'],'astronews.com')
strip: //span[@class='bu'] strip: //span[@class='bu']
strip_image_src: '/_images/' strip_image_src: '/_images/'
test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml

12
inc/3rdparty/site_config/standard/asymco.com.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
# Johannes Stühler # Johannes Stühler
title://h2 title://h2
author://span[@class='meta-content'] author://span[@class='meta-content']
date://abbr[@class='date published']/@title date://abbr[@class='date published']/@title
body://div[@class='entry-content'] body://div[@class='entry-content']
test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/ test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/

8
inc/3rdparty/site_config/standard/autoblog.com.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
prune: no prune: no
body: //div[@class='post-body'] body: //div[@class='post-body']
author: //p[@class='byline']//a author: //p[@class='byline']//a
date: substring-after(//div[@class='about']/p[2], 'Posted') date: substring-after(//div[@class='about']/p[2], 'Posted')
strip: //div[@class='body']/div[@class='meta'] strip: //div[@class='body']/div[@class='meta']
test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/ test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/

4
inc/3rdparty/site_config/standard/avclub.com.txt vendored Normal file → Executable file
View File

@ -1,4 +1,4 @@
author: //*[@id="article_wrapper"]/div[1]/a[1] author: //*[@id="article_wrapper"]/div[1]/a[1]
body: //*[@id="article_wrapper"]/div[2] body: //*[@id="article_wrapper"]/div[2]
date: //*[@id="article_wrapper"]/div[1]/text()[2] date: //*[@id="article_wrapper"]/div[1]/text()[2]
test_url: http://www.avclub.com/articles/forgetmenot,70904 test_url: http://www.avclub.com/articles/forgetmenot,70904

20
inc/3rdparty/site_config/standard/baltimoresun.com.txt vendored Normal file → Executable file
View File

@ -1,12 +1,12 @@
single_page_link: //div[@class='toppaginate']//a[@rel='nofollow'] single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
convert_double_br_tags: yes convert_double_br_tags: yes
title: //div[@class="story"]/h1 title: //div[@class="story"]/h1
body: //div[@id="story-body-text"] body: //div[@id="story-body-text"]
author: //span[@class="byline"] author: //span[@class="byline"]
date: //p[@class="date"] date: //p[@class="date"]
strip: //*[@class='all'] strip: //*[@class='all']
strip: //*[@class='articlerail'] strip: //*[@class='articlerail']
test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story

View File

@ -0,0 +1,13 @@
title: //h1[@class='title']
author: //p[@class="author"]/a[1]
body: //div[@class="article"]
date: //p[@class="date"]
# remove user tools
strip: //div[@class='tools']
strip: //h1
strip: //h2[@class='subtitle']
strip: //p[@class='author']
strip: //p[@class='date']
test_url: http://www.baseballprospectus.com/article.php?articleid=18463

10
inc/3rdparty/site_config/standard/basicthinking.de.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
title: //h2 title: //h2
date: //span[@class='date'] date: //span[@class='date']
body: //div[@class='entry'] body: //div[@class='entry']
strip: //div[@class='zusatz'] strip: //div[@class='zusatz']
test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/ test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/

22
inc/3rdparty/site_config/standard/bb.is.txt vendored Normal file → Executable file
View File

@ -1,13 +1,13 @@
author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20) author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12) date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
body: //div[@class='first-article-big'] body: //div[@class='first-article-big']
strip: //table[@class='newsimagecontainer'] strip: //table[@class='newsimagecontainer']
strip: //h3[@class='headlines'] strip: //h3[@class='headlines']
strip: //iframe[@class='headlines'] strip: //iframe[@class='headlines']
strip: //a[@class='newslink'] strip: //a[@class='newslink']
convert_double_br_tags: yes convert_double_br_tags: yes
test_url: http://bb.is/Pages/82?NewsID=174119 test_url: http://bb.is/Pages/82?NewsID=174119

74
inc/3rdparty/site_config/standard/bbc.co.uk.txt vendored Normal file → Executable file
View File

@ -1,32 +1,42 @@
body: //div[@class="story-body"] body: //div[@class="story-body"]
title: //h1[@class="story-header"] # for video entries
date: //span[@class="story-date"]/span[@class='date'] body: //div[contains(@class, "videoInStory") or @id="meta-information"]
title: //h1[@class="story-header"]
# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055 date: //span[@class="story-date"]/span[@class='date']
body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] # for sport site
date: //meta[@name='DCTERMS.created']/@content
#strip: //div[@class="story-feature narrow"] author: //div[@id='headline']//span[@class='byline-name']
#strip: //div[@class="story-feature wide"]
#strip: //div[@class="story-feature dslideshow-enclosure"] # recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
strip: //div[contains(@class, "story-feature")] body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
strip: //span[@class="story-date"]
#strip: //div[@class="caption body-narrow-width"] #strip: //div[@class="story-feature narrow"]
strip: //div[@class="warning"]//p #strip: //div[@class="story-feature wide"]
strip: //div[@id='page-bookmark-links-head'] #strip: //div[@class="story-feature dslideshow-enclosure"]
strip: //object strip: //div[contains(@class, "story-feature")]
strip: //div[contains(@class, "bbccom_advert_placeholder")] strip: //span[@class="story-date"]
strip: //div[contains(@class, "embedded-hyper")] #strip: //div[@class="caption body-narrow-width"]
strip: //div[contains(@class, 'market-data')] strip: //div[@class="warning"]//p
strip: //a[contains(@class, 'hidden')] strip: //div[@id='page-bookmark-links-head']
strip: //div[contains(@class, 'hypertabs')] strip: //object
strip: //div[contains(@class, 'related')] strip: //div[contains(@class, "bbccom_advert_placeholder")]
strip: //form[@id='comment-form'] strip: //div[contains(@class, "embedded-hyper")]
strip: //div[contains(@class, 'comment-introduction')] strip: //div[contains(@class, 'market-data')]
strip: //a[contains(@class, 'hidden')]
replace_string(<noscript>): <div> strip: //div[contains(@class, 'hypertabs')]
replace_string(</noscript>): </div> strip: //div[contains(@class, 'related')]
strip: //form[@id='comment-form']
prune: no strip: //div[contains(@class, 'comment-introduction')]
strip: //div[contains(@class, 'share-tools')]
dissolve: //h2 strip: //div[@id='also-related-links']
test_url: http://www.bbc.co.uk/news/business-15060862
replace_string(<noscript>): <div>
replace_string(</noscript>): </div>
prune: no
dissolve: //h2
test_url: http://www.bbc.co.uk/sport/0/football/23224017
test_url: http://www.bbc.co.uk/news/business-15060862
# video entry
test_url: http://www.bbc.co.uk/news/world-asia-22056933

View File

@ -0,0 +1,16 @@
title: //header//h1
#body: //article[contains(@class, 'node-full')]
body: //div[contains(@class, 'recipe-details') or contains(@class, 'tips-carousel')] | //section[@id='recipe-ingredients' or @id='recipe-method']
strip_id_or_class: recipe-rating-wrapper
strip_id_or_class: magazine-subcribe-header
strip_id_or_class: hide
strip_id_or_class: recipe-actions
strip_id_or_class: buy-ingredients
strip_id_or_class: related-content
strip_id_or_class: recipe-magazine-ad
strip_id_or_class: copy-right
prune: no
test_url: http://www.bbcgoodfood.com/recipes/1131634/minced-beef-wellington

28
inc/3rdparty/site_config/standard/benoitmaison.org.txt vendored Normal file → Executable file
View File

@ -1,16 +1,16 @@
body: //div[@class="entry-content"] body: //div[@class="entry-content"]
# Remove text &lsquo;Tweet&rsquo; # Remove text &lsquo;Tweet&rsquo;
strip: //div[@class="entry-content"]/div[last()] strip: //div[@class="entry-content"]/div[last()]
title: //h1[@class="entry-title"] title: //h1[@class="entry-title"]
# If the Instapaper text parser worked with HTML5 tags, we would use: # If the Instapaper text parser worked with HTML5 tags, we would use:
date: //time[@class="entry-date"] date: //time[@class="entry-date"]
# But since it does not, use this more complicated rule: # But since it does not, use this more complicated rule:
date: //div[@class="entry-meta"]/a[@rel="bookmark"] date: //div[@class="entry-meta"]/a[@rel="bookmark"]
# Unfortunately, the following rule is overridden by the automatically found author. # Unfortunately, the following rule is overridden by the automatically found author.
author: ("Benoit Maison") author: ("Benoit Maison")
test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/ test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/

2
inc/3rdparty/site_config/standard/berlingske.dk.txt vendored Normal file → Executable file
View File

@ -1,3 +1,3 @@
title: //h1[@class='headline'] title: //h1[@class='headline']
body: //div[contains(@class, 'article-wrapper')] body: //div[contains(@class, 'article-wrapper')]
test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa

View File

@ -0,0 +1,5 @@
body: //div[contains(@class, "NewsText")]
prune: no
test_url: http://www.bernama.com/bernama/v7/rss/english.php
test_url: http://www.bernama.com/bernama/v7/newsindex.php?id=943513

0
inc/3rdparty/site_config/standard/betabeat.com.txt vendored Normal file → Executable file
View File

10
inc/3rdparty/site_config/standard/betanews.com.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
# some articles at this site, like this one, don't # some articles at this site, like this one, don't
# seem to pick up the article body via normal # seem to pick up the article body via normal
# processing, other articles come through fine # processing, other articles come through fine
# http://www.betanews.com/joewilcox/article # http://www.betanews.com/joewilcox/article
# /Google-is-a-marketing-sensation/1309708375 # /Google-is-a-marketing-sensation/1309708375
body: //*[@id="article"] body: //*[@id="article"]
test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375 test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375

12
inc/3rdparty/site_config/standard/biography.com.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
title: //div[contains(@class, 'main-content')]//h1 title: //div[contains(@class, 'main-content')]//h1
body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')] body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
prune: no prune: no
single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')] single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
test_url: http://www.biography.com/print/profile/martin-luther-9389283 test_url: http://www.biography.com/print/profile/martin-luther-9389283

0
inc/3rdparty/site_config/standard/bitelia.com.txt vendored Normal file → Executable file
View File

View File

@ -0,0 +1,13 @@
date: //meta[@name='publish-date']/@content
body: //div[contains(@class, 'articleContentWrapper')]
prune: no
strip: //div[contains(@class, 'staff_info')]//dd[contains(., 'Twitter')]
strip_id_or_class: related_content
strip_id_or_class: enlarge
strip_id_or_class: photoBy
strip_id_or_class: older
test_url: http://www.bizjournals.com/cincinnati/news/2013/10/03/harris-teeter-shareholders-vote-on.html
test_url: http://feeds.bizjournals.com/industry_20?format=xml

10
inc/3rdparty/site_config/standard/bjango.com.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
title: //h1[@class='articlehead'] title: //h1[@class='articlehead']
body: //div[@class='column'] body: //div[@class='column']
strip: //h1 strip: //h1
strip: //div[@class='help'] strip: //div[@class='help']
#no author or date/time provided in current layout #no author or date/time provided in current layout
test_url: http://bjango.com/articles/actions/ test_url: http://bjango.com/articles/actions/

12
inc/3rdparty/site_config/standard/blog.arsln.org.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
tidy: no tidy: no
prune: no prune: no
date: //article/header/h6/time date: //article/header/h6/time
title: //article/header/h3 title: //article/header/h3
author: //meta[@name='author']/@content author: //meta[@name='author']/@content
body: //article//post body: //article//post
test_url: http://blog.arsln.org/aska-ayip-oluyor/ test_url: http://blog.arsln.org/aska-ayip-oluyor/

10
inc/3rdparty/site_config/standard/blog.asmartbear.com.txt vendored Normal file → Executable file
View File

@ -1,7 +1,7 @@
title: //title title: //title
author: //span[@class='author vcard']/a author: //span[@class='author vcard']/a
date: //p[@class='headline_meta']/abbr[@class='published'] date: //p[@class='headline_meta']/abbr[@class='published']
body: //div[@class='format_text entry-content'] body: //div[@class='format_text entry-content']
strip: //div[@id='dd_ajax_float'] strip: //div[@id='dd_ajax_float']
test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html

14
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt vendored Normal file → Executable file
View File

@ -1,9 +1,9 @@
# Instapaper gets this back to front and only gets the blog title instead of the article title. # Instapaper gets this back to front and only gets the blog title instead of the article title.
title: substring-before(//title, '-') title: substring-before(//title, '-')
author: //a[ contains(@href, '/people') ] author: //a[ contains(@href, '/people') ]
body: //div[ @class='post' ] body: //div[ @class='post' ]
# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous. # Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n

6
inc/3rdparty/site_config/standard/blog.fefe.de.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
title: //h2 title: //h2
date: //h3 date: //h3
body: //ul body: //ul
test_url: http://blog.fefe.de/?ts=b063bf55 test_url: http://blog.fefe.de/?ts=b063bf55

18
inc/3rdparty/site_config/standard/blog.instagram.com.txt vendored Normal file → Executable file
View File

@ -1,11 +1,11 @@
# clean Instagram blog a little bit # clean Instagram blog a little bit
tidy:no tidy:no
prune:no prune:no
body://div[contains(@id,'content')] body://div[contains(@id,'content')]
strip_id_or_class:meta strip_id_or_class:meta
strip_id_or_class:notes strip_id_or_class:notes
strip_id_or_class:pagination strip_id_or_class:pagination
test_url: http://blog.instagram.com/post/8757832007/fromwhereistand test_url: http://blog.instagram.com/post/8757832007/fromwhereistand

View File

@ -0,0 +1,9 @@
author: //a[@href="http://www.marco.org/about"]
date: //span[@class="date"]
# Remove the date from article body.
strip: //span[@class="date"]
# Remove pagination links from article body.
strip: //div[@id="pagination"]
test_url: http://blog.instapaper.com/post/31303984531

4
inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt vendored Normal file → Executable file
View File

@ -1,4 +1,4 @@
date: //span[contains(@class, 'date-links')] date: //span[contains(@class, 'date-links')]
author: //span[contains(@class, 'author-links')] author: //span[contains(@class, 'author-links')]
body: //div[contains(@class, 'entry-content')] body: //div[contains(@class, 'entry-content')]
test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web

6
inc/3rdparty/site_config/standard/blog.kaelig.fr.txt vendored Normal file → Executable file
View File

@ -1,5 +1,5 @@
body: //*[contains(@class, 'post_content')] body: //*[contains(@class, 'post_content')]
author: string('Kaelig Deloumeau-Prigent') author: string('Kaelig Deloumeau-Prigent')
title: //h1[@class='title'] title: //h1[@class='title']
date: //span[@class='date'] date: //span[@class='date']
test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par

8
inc/3rdparty/site_config/standard/blog.naver.com.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //span[@class='pcol1 itemSubjectBoldfont'] title: //span[@class='pcol1 itemSubjectBoldfont']
body: //div[@id='postListBody'] body: //div[@id='postListBody']
date: //p[@class='date fil5 pcol2'] date: //p[@class='date fil5 pcol2']
single_page_link: /html/frameset/frame[1]/attribute::src single_page_link: /html/frameset/frame[1]/attribute::src
strip: //div[@class='post-btn'] strip: //div[@class='post-btn']
test_url: http://blog.naver.com/how2invest/110135068757 test_url: http://blog.naver.com/how2invest/110135068757

20
inc/3rdparty/site_config/standard/blog.pchome.net.txt vendored Normal file → Executable file
View File

@ -1,12 +1,12 @@
# PCHOME blog, a popular Chinese blog host # PCHOME blog, a popular Chinese blog host
# Oct 15, 2011 # Oct 15, 2011
# #
title://*[contains(@class,'imp')]/h2 title://*[contains(@class,'imp')]/h2
date://*[contains(@class,'imp')]/span date://*[contains(@class,'imp')]/span
body://div[contains(@id,'blog_content')] body://div[contains(@id,'blog_content')]
test_url: http://blog.pchome.net/article/462502.html test_url: http://blog.pchome.net/article/462502.html

8
inc/3rdparty/site_config/standard/blog.pinboard.in.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //a[@class="blog_title"] title: //a[@class="blog_title"]
date: //p[@class="when"]/a date: //p[@class="when"]/a
body: //div[@class="blog_entry"] body: //div[@class="blog_entry"]
strip_id_or_class:blog_title strip_id_or_class:blog_title
strip_id_or_class:when strip_id_or_class:when
test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/ test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/

View File

@ -0,0 +1,11 @@
# This filter is tested on:
# http://blog.renren.com/share/224959024/14260739544
# http://blog.renren.com/share/231323504/14261768898
# http://blog.renren.com/share/230305019/1502806705
title://h1[contains(@class, 'title-article')]
author://span[contains(@class, 'name')]
body://div[contains(@class, 'content-body')]
convert_double_br_tags:yes
test_url: http://blog.renren.com/share/230305019/1502806705

50
inc/3rdparty/site_config/standard/blog.sina.com.cn.txt vendored Normal file → Executable file
View File

@ -1,26 +1,26 @@
# Sina blog, the most popular blog host in China. # Sina blog, the most popular blog host in China.
# Its source code is horrible. # Its source code is horrible.
# #
# Issue: # Issue:
# Only the first image in the article is displayed. # Only the first image in the article is displayed.
# The remaining images are replaced with a 1x1 transparent gif by the sina blog host. # The remaining images are replaced with a 1x1 transparent gif by the sina blog host.
# #
title://*[contains(@class,'titName SG_txta')] title://*[contains(@class,'titName SG_txta')]
author://*[contains(@id,'ownernick')] author://*[contains(@id,'ownernick')]
date://*[contains(@class,'time SG_txtc')] date://*[contains(@class,'time SG_txtc')]
body://div[contains(@class,'articalContent')] body://div[contains(@class,'articalContent')]
# Remove redundant content whose span class starts with "MASS" # Remove redundant content whose span class starts with "MASS"
# Example <span class="MASSf21674ffeef7"></span> # Example <span class="MASSf21674ffeef7"></span>
strip://span[contains(@class,'MASS')] strip://span[contains(@class,'MASS')]
# Remove comment # Remove comment
strip://div[contains(@class,'allComm')] strip://div[contains(@class,'allComm')]
# Remove hidden text and links # Remove hidden text and links
strip://ins strip://ins
tidy:no tidy:no
convert_double_br_tags:yes convert_double_br_tags:yes
test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html

0
inc/3rdparty/site_config/standard/blog.spu.edu.txt vendored Normal file → Executable file
View File

10
inc/3rdparty/site_config/standard/blog.wells.ee.txt vendored Normal file → Executable file
View File

@ -1,6 +1,6 @@
title: //h2/a[@class="no-link title"] title: //h2/a[@class="no-link title"]
author: //h2[@id="blog_owner"] author: //h2[@id="blog_owner"]
date: //time date: //time
strip: //h2/a[@class="no-link title"] strip: //h2/a[@class="no-link title"]
test_url: http://blog.wells.ee/retina test_url: http://blog.wells.ee/retina
test_url: http://blog.wells.ee/skeuomorphism test_url: http://blog.wells.ee/skeuomorphism

12
inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt vendored Normal file → Executable file
View File

@ -1,8 +1,8 @@
# 2011-08-23 [carlo@...] Initial version. # 2011-08-23 [carlo@...] Initial version.
author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text() author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
# why yes, I do feel a bit dirty # why yes, I do feel a bit dirty
date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " ) date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero

0
inc/3rdparty/site_config/standard/blogs.forbes.com.txt vendored Normal file → Executable file
View File

6
inc/3rdparty/site_config/standard/blogs.hbr.org.txt vendored Normal file → Executable file
View File

@ -1,4 +1,4 @@
title: //div[@id='pageFeature']/h1 title: //div[@id='pageFeature']/h1
body: //div[@id='articleBody'] body: //div[@id='articleBody']
strip: //div[@class='module wide'] strip: //div[@class='module wide']
test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29 test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29

Some files were not shown because too many files have changed in this diff Show More