mirror of
https://github.com/moparisthebest/wallabag
synced 2024-11-27 03:12:21 -05:00
updated specific configuration for parsing
This commit is contained in:
parent
58dbe10388
commit
4e067ceabd
8
inc/3rdparty/site_config/standard/24ways.org.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/24ways.org.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //div[@class='meta']/h2/a
|
||||
author: //div[@class='meta']/h2/following-sibling::p/a/text()
|
||||
date://div[@class='meta']/h2/strong
|
||||
body: //div[@id='article']
|
||||
title: //div[@class='meta']/h2/a
|
||||
author: //div[@class='meta']/h2/following-sibling::p/a/text()
|
||||
date://div[@class='meta']/h2/strong
|
||||
body: //div[@id='article']
|
||||
strip: //div[@class='domore']
|
||||
test_url: http://24ways.org/2011/composing-the-new-canon
|
8
inc/3rdparty/site_config/standard/36kr.com.txt
vendored
Executable file
8
inc/3rdparty/site_config/standard/36kr.com.txt
vendored
Executable file
@ -0,0 +1,8 @@
|
||||
title: //h1[contains(@class, 'entry-title')]
|
||||
date: //meta[@name='weibo: article:create_at']/@content
|
||||
body: //div[contains(@class, 'mainContent')]
|
||||
strip_id_or_class: related_topics
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.36kr.com/p/207879.html
|
8
inc/3rdparty/site_config/standard/37signals.com.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/37signals.com.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //div[@class='post_header']//h2/a
|
||||
author: //span[@class='author']
|
||||
date: //span[@class='date']
|
||||
body: //div[@id='Content']
|
||||
title: //div[@class='post_header']//h2/a
|
||||
author: //span[@class='author']
|
||||
date: //span[@class='date']
|
||||
body: //div[@id='Content']
|
||||
|
||||
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department
|
16
inc/3rdparty/site_config/standard/3quarksdaily.com.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/3quarksdaily.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
body: //div[@class='content']
|
||||
date: //div[@class='content']/h2
|
||||
strip: //div[@class='content']/h2
|
||||
title: //div[@class='content']/h3
|
||||
|
||||
strip: //div[@id='postmenu']
|
||||
strip: //div[@class='trackback']
|
||||
tidy: no
|
||||
body: //div[@class='content']
|
||||
date: //div[@class='content']/h2
|
||||
strip: //div[@class='content']/h2
|
||||
title: //div[@class='content']/h3
|
||||
|
||||
strip: //div[@id='postmenu']
|
||||
strip: //div[@class='trackback']
|
||||
tidy: no
|
||||
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html
|
0
inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/43folders.com.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/43folders.com.txt
vendored
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
body: //*[@class = 'content']
|
||||
author: //*[@class = 'submitted']/a
|
||||
body: //*[@class = 'content']
|
||||
author: //*[@class = 'submitted']/a
|
||||
date: substring-after(//*[@class = 'submitted']/text(), '|')
|
||||
test_url: http://www.43folders.com/2011/04/22/cranking
|
50
inc/3rdparty/site_config/standard/500px.com.txt
vendored
Normal file → Executable file
50
inc/3rdparty/site_config/standard/500px.com.txt
vendored
Normal file → Executable file
@ -1,27 +1,27 @@
|
||||
# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
|
||||
# photo page example: http://500px.com/photo/4181666
|
||||
# blog page example: http://500px.com/blog/110
|
||||
|
||||
# avoid "no text" error
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
# reorganize photo page elements
|
||||
#body://div[contains(@class,'container')]
|
||||
move_into(body)://div[contains(@id,'thephoto')]
|
||||
move_into(body)://div[contains(@id,'description')]
|
||||
move_into(body)://div[contains(@id,'tags')]
|
||||
move_into(body)://div[contains(@id,'photo-info')]
|
||||
|
||||
# clean photo page info
|
||||
strip://span[contains(@id,'copyright')]
|
||||
strip://*[contains(@id,'store')]
|
||||
strip://*[contains(@id,'user-info')]
|
||||
strip://*[contains(@id,'photo-stats')]
|
||||
strip://*[contains(@id,'voting_controls_container')]
|
||||
strip://*[contains(@id,'more-photos')]
|
||||
strip://*[contains(@id,'embed-photo')]
|
||||
|
||||
# clean blog page side bar
|
||||
# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
|
||||
# photo page example: http://500px.com/photo/4181666
|
||||
# blog page example: http://500px.com/blog/110
|
||||
|
||||
# avoid "no text" error
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
# reorganize photo page elements
|
||||
#body://div[contains(@class,'container')]
|
||||
move_into(body)://div[contains(@id,'thephoto')]
|
||||
move_into(body)://div[contains(@id,'description')]
|
||||
move_into(body)://div[contains(@id,'tags')]
|
||||
move_into(body)://div[contains(@id,'photo-info')]
|
||||
|
||||
# clean photo page info
|
||||
strip://span[contains(@id,'copyright')]
|
||||
strip://*[contains(@id,'store')]
|
||||
strip://*[contains(@id,'user-info')]
|
||||
strip://*[contains(@id,'photo-stats')]
|
||||
strip://*[contains(@id,'voting_controls_container')]
|
||||
strip://*[contains(@id,'more-photos')]
|
||||
strip://*[contains(@id,'embed-photo')]
|
||||
|
||||
# clean blog page side bar
|
||||
strip://*[contains(@class,'col d3 clearafter')]
|
||||
test_url: http://500px.com/photo/3641041?from=editors
|
0
inc/3rdparty/site_config/standard/512pixels.net.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/512pixels.net.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/5by5.tv.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/5by5.tv.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
body: //*[@id="episode"]
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
autodetect_next_page: no
|
||||
strip_id_or_class: player
|
||||
|
||||
body: //*[@id="episode"]
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
autodetect_next_page: no
|
||||
strip_id_or_class: player
|
||||
|
||||
strip://*[@id="header"]
|
||||
test_url: http://5by5.tv/buildanalyze/60
|
7
inc/3rdparty/site_config/standard/7newsbelize.com.txt
vendored
Executable file
7
inc/3rdparty/site_config/standard/7newsbelize.com.txt
vendored
Executable file
@ -0,0 +1,7 @@
|
||||
title: //*[@id='sstitle']
|
||||
body: //div[@id='sstory']
|
||||
strip_id_or_class: newsoptions
|
||||
prune: no
|
||||
|
||||
test_url: http://www.7newsbelize.com/sstory.php?nid=25654
|
||||
test_url: http://www.7newsbelize.com/7news.xml
|
14
inc/3rdparty/site_config/standard/944.com.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/944.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
title: //h2[@class='border']
|
||||
body: //div[@class='padding']
|
||||
|
||||
convert_double_br_tags: yes
|
||||
|
||||
strip: //div[@id='social_sharing']
|
||||
strip: //div[@class='socialLinks']
|
||||
title: //h2[@class='border']
|
||||
body: //div[@class='padding']
|
||||
|
||||
convert_double_br_tags: yes
|
||||
|
||||
strip: //div[@id='social_sharing']
|
||||
strip: //div[@class='socialLinks']
|
||||
|
||||
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/
|
38
inc/3rdparty/site_config/standard/README.md
vendored
Executable file
38
inc/3rdparty/site_config/standard/README.md
vendored
Executable file
@ -0,0 +1,38 @@
|
||||
Full-Text RSS site config files
|
||||
================
|
||||
|
||||
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks whether there are extraction rules for that site. If there are none, it tries to detect the content block automatically.
|
||||
|
||||
This repository contains the site config files we use in Full-Text RSS.
|
||||
|
||||
### Contributing changes
|
||||
|
||||
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
|
||||
|
||||
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
|
||||
|
||||
> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
|
||||
|
||||
When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
|
||||
|
||||
If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
|
||||
|
||||
### How to write a site config file
|
||||
|
||||
The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It is a simple tool, intended only to create a rule that extracts the correct content block.
|
||||
|
||||
For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
|
||||
|
||||
### Instapaper
|
||||
|
||||
When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
|
||||
|
||||
Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
|
||||
|
||||
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
|
||||
|
||||
Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required).
|
||||
|
||||
### Testing site config files
|
||||
|
||||
Currently you need a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.
|
18
inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt
vendored
Normal file → Executable file
@ -1,10 +1,10 @@
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757
|
18
inc/3rdparty/site_config/standard/aachener-zeitung.de.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/aachener-zeitung.de.txt
vendored
Normal file → Executable file
@ -1,10 +1,10 @@
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
|
||||
|
||||
strip_id_or_class: socialshareprivacy1
|
||||
strip_id_or_class: zvaFacebookButton
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718
|
10
inc/3rdparty/site_config/standard/abc.es.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/abc.es.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text']
|
||||
strip_id_or_class: colB
|
||||
|
||||
prune: no
|
||||
title: //meta[@property='og:title']/@content
|
||||
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody']
|
||||
strip_id_or_class: colB
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html
|
26
inc/3rdparty/site_config/standard/abc.net.au.txt
vendored
Normal file → Executable file
26
inc/3rdparty/site_config/standard/abc.net.au.txt
vendored
Normal file → Executable file
@ -1,10 +1,18 @@
|
||||
title: //h1
|
||||
author: //div[@class="byline"]/a
|
||||
date: //span[@class="timestamp"]
|
||||
|
||||
strip: //p[@class="topics"]
|
||||
strip: //h1
|
||||
strip: //div[@class="byline"]
|
||||
strip: //p[@class="published"]
|
||||
title: //div[@class='article section']//h1
|
||||
author: //div[@class="byline"]/a
|
||||
date: //span[@class="timestamp"]
|
||||
body: //div[@class="page section"]
|
||||
|
||||
strip: //a[@class="inline-caption"]
|
||||
strip: //p[@class="ticker section noprint"]
|
||||
strip: //p[@class="topics"]
|
||||
strip: //h1
|
||||
strip: //div[@class="byline"]
|
||||
strip: //p[@class="published"]
|
||||
strip: //div[contains(@class,"featured-scroller")]
|
||||
test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544
|
||||
strip_id_or_class: footer
|
||||
|
||||
tidy: no
|
||||
|
||||
test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892
|
||||
test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business
|
||||
|
52
inc/3rdparty/site_config/standard/abcnews.go.com.txt
vendored
Normal file → Executable file
52
inc/3rdparty/site_config/standard/abcnews.go.com.txt
vendored
Normal file → Executable file
@ -1,27 +1,27 @@
|
||||
title: //h1[@class='headline']
|
||||
body: //div[@id='storyText']
|
||||
# for video entries
|
||||
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
|
||||
author: //div[@class='byline']
|
||||
date: //div[@class='date']
|
||||
strip: //*[@id='date_partner']
|
||||
|
||||
strip: //div[@class='breadcrumb']
|
||||
strip: //div[contains(@class,'show_tools')]
|
||||
strip: //div[@id='sponsoredByAd']
|
||||
strip: //div[contains(@class,'rel_container')]
|
||||
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
|
||||
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
|
||||
strip: //p[contains(., 'Click here to return to')]
|
||||
#strip_id_or_class: media
|
||||
strip_id_or_class: mediaplayer
|
||||
|
||||
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
|
||||
|
||||
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
|
||||
# multi-page
|
||||
title: //h1[@class='headline']
|
||||
body: //div[@id='storyText']
|
||||
# for video entries
|
||||
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
|
||||
author: //div[@class='byline']
|
||||
date: //div[@class='date']
|
||||
strip: //*[@id='date_partner']
|
||||
|
||||
strip: //div[@class='breadcrumb']
|
||||
strip: //div[contains(@class,'show_tools')]
|
||||
strip: //div[@id='sponsoredByAd']
|
||||
strip: //div[contains(@class,'rel_container')]
|
||||
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
|
||||
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
|
||||
strip: //p[contains(., 'Click here to return to')]
|
||||
#strip_id_or_class: media
|
||||
strip_id_or_class: mediaplayer
|
||||
|
||||
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
|
||||
|
||||
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
|
||||
# multi-page
|
||||
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544
|
16
inc/3rdparty/site_config/standard/accesstoinsight.org.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/accesstoinsight.org.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
title: //div[@id='H_docTitle']
|
||||
|
||||
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
|
||||
|
||||
strip_id_or_class: F_toenail
|
||||
|
||||
prune: no
|
||||
|
||||
title: //div[@id='H_docTitle']
|
||||
|
||||
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
|
||||
|
||||
strip_id_or_class: F_toenail
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html
|
4
inc/3rdparty/site_config/standard/acidcow.com.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/acidcow.com.txt
vendored
Normal file → Executable file
@ -1,3 +1,3 @@
|
||||
body: //div[starts-with(@id, 'news-id-')]
|
||||
|
||||
body: //div[starts-with(@id, 'news-id-')]
|
||||
|
||||
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html
|
14
inc/3rdparty/site_config/standard/acquia.com.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/acquia.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
title://h1[@class="title"]
|
||||
author://div[@class="submitted"]/span/a
|
||||
date://div[@class="submitted"]/span
|
||||
body://div[@class="content-wrapper"]
|
||||
|
||||
strip://div[@id="skip-link"]
|
||||
strip://div[@id="region-content-3-3"]
|
||||
title://h1[@class="title"]
|
||||
author://div[@class="submitted"]/span/a
|
||||
date://div[@class="submitted"]/span
|
||||
body://div[@class="content-wrapper"]
|
||||
|
||||
strip://div[@id="skip-link"]
|
||||
strip://div[@id="region-content-3-3"]
|
||||
strip://div[@id="section-footer"]
|
||||
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code
|
6
inc/3rdparty/site_config/standard/acroswing.fr.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/acroswing.fr.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
tidy:no
|
||||
date: //time[@class='updated']
|
||||
dissolve: //ul[@class='video-gallery']/li
|
||||
tidy:no
|
||||
date: //time[@class='updated']
|
||||
dissolve: //ul[@class='video-gallery']/li
|
||||
dissolve: //ul[@class='video-gallery']
|
||||
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php
|
5
inc/3rdparty/site_config/standard/aftenposten.no.txt
vendored
Executable file
5
inc/3rdparty/site_config/standard/aftenposten.no.txt
vendored
Executable file
@ -0,0 +1,5 @@
|
||||
title: //h1[@class='articleTitle ']
|
||||
body: //div[@class='bodyText widget storyContent']
|
||||
strip: //p/span[@class='quote']/..
|
||||
strip_id_or_class: 'pull1'
|
||||
test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html
|
13
inc/3rdparty/site_config/standard/aftonbladet.se.txt
vendored
Executable file
13
inc/3rdparty/site_config/standard/aftonbladet.se.txt
vendored
Executable file
@ -0,0 +1,13 @@
|
||||
author: //article//address[contains(@class, 'author')]
|
||||
body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')]
|
||||
|
||||
strip: //address//img
|
||||
strip: //footer
|
||||
strip_id_or_class: abSticky
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab
|
||||
test_url: http://www.aftonbladet.se/debatt/article16207536.ab
|
||||
test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab
|
||||
test_url: http://www.aftonbladet.se/rss.xml
|
26
inc/3rdparty/site_config/standard/aht.seriouseats.com.txt
vendored
Normal file → Executable file
26
inc/3rdparty/site_config/standard/aht.seriouseats.com.txt
vendored
Normal file → Executable file
@ -1,15 +1,15 @@
|
||||
body: //div[@id='content']
|
||||
|
||||
# clean up recipe pages
|
||||
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
|
||||
|
||||
#recipe pages
|
||||
strip_id_or_class: "recipe-feedback"
|
||||
strip_id_or_class: "comments"
|
||||
strip_id_or_class: "procedure-number"
|
||||
strip_id_or_class: "more-with-author"
|
||||
|
||||
#slice
|
||||
strip_id_or_class: "inner"
|
||||
body: //div[@id='content']
|
||||
|
||||
# clean up recipe pages
|
||||
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
|
||||
|
||||
#recipe pages
|
||||
strip_id_or_class: "recipe-feedback"
|
||||
strip_id_or_class: "comments"
|
||||
strip_id_or_class: "procedure-number"
|
||||
strip_id_or_class: "more-with-author"
|
||||
|
||||
#slice
|
||||
strip_id_or_class: "inner"
|
||||
|
||||
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html
|
6
inc/3rdparty/site_config/standard/albayan.ae.txt
vendored
Executable file
6
inc/3rdparty/site_config/standard/albayan.ae.txt
vendored
Executable file
@ -0,0 +1,6 @@
|
||||
body: //div[@id='main-column']//div[@class='content']
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645
|
||||
test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout
|
0
inc/3rdparty/site_config/standard/alex.mullr.net.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/alex.mullr.net.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/alexduner.com.txt
vendored
Executable file
4
inc/3rdparty/site_config/standard/alexduner.com.txt
vendored
Executable file
@ -0,0 +1,4 @@
|
||||
body: //section[@class='content']
|
||||
date: //span[1]
|
||||
author: //h1[@id='sitetitle']
|
||||
test_url: https://alexduner.com/blog/2013/1/something-i-learned-today
|
4
inc/3rdparty/site_config/standard/alexduner.squarespace.com.txt
vendored
Executable file
4
inc/3rdparty/site_config/standard/alexduner.squarespace.com.txt
vendored
Executable file
@ -0,0 +1,4 @@
|
||||
body: //section[@class='content']
|
||||
date: //span[1]
|
||||
author: //h1[@id='sitetitle']
|
||||
test_url: https://alexduner.squarespace.com/blog/2013/1/tech-culture-from-the-outside-looking-in
|
20
inc/3rdparty/site_config/standard/alistapart.com.txt
vendored
Normal file → Executable file
20
inc/3rdparty/site_config/standard/alistapart.com.txt
vendored
Normal file → Executable file
@ -1,12 +1,12 @@
|
||||
title: //h1[@class='title']
|
||||
author: //h3[@class='byline']/a
|
||||
date: //div[@class='ishinfo']
|
||||
|
||||
body: //*[@id='articletext']
|
||||
strip_id_or_class: 'ishinfo'
|
||||
strip_id_or_class: 'metastuff'
|
||||
strip_id_or_class: 'learnmore'
|
||||
strip_id_or_class: 'discuss'
|
||||
|
||||
title: //h1[@class='title']
|
||||
author: //h3[@class='byline']/a
|
||||
date: //div[@class='ishinfo']
|
||||
|
||||
body: //*[@id='articletext']
|
||||
strip_id_or_class: 'ishinfo'
|
||||
strip_id_or_class: 'metastuff'
|
||||
strip_id_or_class: 'learnmore'
|
||||
strip_id_or_class: 'discuss'
|
||||
|
||||
prune: no
|
||||
test_url: http://www.alistapart.com/articles/organizing-mobile/
|
14
inc/3rdparty/site_config/standard/aljazeera.com.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/aljazeera.com.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
title: //span[@id='DetailedTitle']
|
||||
body: //td[@id='tdTextContent']
|
||||
strip_id_or_class: Skyscrapper_Body
|
||||
date: //span[@id='ctl00_cphBody_lblDate']
|
||||
author: //div[@id="dvAuthorInfo"]//a/text()
|
||||
strip: //table[ tbody/tr/td/object ]
|
||||
prune: no
|
||||
title: //span[@id='DetailedTitle']
|
||||
body: //td[@id='tdTextContent']
|
||||
strip_id_or_class: Skyscrapper_Body
|
||||
date: //span[@id='ctl00_cphBody_lblDate']
|
||||
author: //div[@id="dvAuthorInfo"]//a/text()
|
||||
strip: //table[ tbody/tr/td/object ]
|
||||
prune: no
|
||||
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html
|
24
inc/3rdparty/site_config/standard/allrecipes.com.txt
vendored
Normal file → Executable file
24
inc/3rdparty/site_config/standard/allrecipes.com.txt
vendored
Normal file → Executable file
@ -1,14 +1,14 @@
|
||||
title: //h1[@id='itemTitle']
|
||||
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
|
||||
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
|
||||
strip: //div[contains(@class, 'rightcoltoolsdiv')]
|
||||
strip: //div[contains(@class, 'servings-form')]
|
||||
strip: //p[@class='nutritional-information']
|
||||
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
|
||||
strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
|
||||
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
|
||||
strip_id_or_class: eshaAttribute
|
||||
strip_id_or_class: eshaParagraph
|
||||
prune: no
|
||||
title: //h1[@id='itemTitle']
|
||||
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
|
||||
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
|
||||
strip: //div[contains(@class, 'rightcoltoolsdiv')]
|
||||
strip: //div[contains(@class, 'servings-form')]
|
||||
strip: //p[@class='nutritional-information']
|
||||
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
|
||||
strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
|
||||
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
|
||||
strip_id_or_class: eshaAttribute
|
||||
strip_id_or_class: eshaParagraph
|
||||
prune: no
|
||||
|
||||
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd
|
21
inc/3rdparty/site_config/standard/allthingsd.com.txt
vendored
Normal file → Executable file
21
inc/3rdparty/site_config/standard/allthingsd.com.txt
vendored
Normal file → Executable file
@ -1,10 +1,13 @@
|
||||
title://div[@class="article-title"]/h1[@class="title"]
|
||||
date: //p[@class="article-date"]
|
||||
body://*[@class="article-body article-text"]
|
||||
# Trim out related posts at bottom of article
|
||||
strip://blockquote[@class="memo"]
|
||||
|
||||
# Yup, no idea why author won't work...
|
||||
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
|
||||
title://div[@class="article-title"]/h1[@class="title"]
|
||||
date: //p[@class="article-date"]
|
||||
body://div[contains(@class, "article-body")]
|
||||
# Trim out related posts at bottom of article
|
||||
strip://blockquote[@class="memo"]
|
||||
|
||||
tidy: no
|
||||
|
||||
# Yup, no idea why author won't work...
|
||||
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
|
||||
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
|
||||
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
|
||||
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
|
||||
test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/
|
12
inc/3rdparty/site_config/standard/allyou.com.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/allyou.com.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
title: //div[@id='pageHdr']//h1
|
||||
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
|
||||
strip: //div[contains(@class, 'infoBox') or @id='infoBox']
|
||||
single_page_link: //li[@id='print']/a
|
||||
|
||||
title: //div[@id='pageHdr']//h1
|
||||
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
|
||||
strip: //div[contains(@class, 'infoBox') or @id='infoBox']
|
||||
single_page_link: //li[@id='print']/a
|
||||
|
||||
prune: no
|
||||
|
||||
|
||||
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/
|
18
inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt
vendored
Normal file → Executable file
@ -1,11 +1,11 @@
|
||||
body: //div[@class = 'entry']
|
||||
date: substring-after(//p[@class="date"],'بتاريخ ')
|
||||
strip_id_or_class: date
|
||||
strip_id_or_class: follow-single
|
||||
strip_id_or_class: ratingblock
|
||||
strip_id_or_class: newRatingHolder
|
||||
strip_id_or_class: postmetadata
|
||||
strip_id_or_class: addthis_toolbox
|
||||
strip_id_or_class: addthis_default_style
|
||||
body: //div[@class = 'entry']
|
||||
date: substring-after(//p[@class="date"],'بتاريخ ')
|
||||
strip_id_or_class: date
|
||||
strip_id_or_class: follow-single
|
||||
strip_id_or_class: ratingblock
|
||||
strip_id_or_class: newRatingHolder
|
||||
strip_id_or_class: postmetadata
|
||||
strip_id_or_class: addthis_toolbox
|
||||
strip_id_or_class: addthis_default_style
|
||||
strip_id_or_class: size-full
|
||||
test_url: http://alphabeta.argaam.com/?p=35657
|
16
inc/3rdparty/site_config/standard/alriyadh.com.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/alriyadh.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
body: //div[@id = "article-view"]
|
||||
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
|
||||
author: //p[@class = "author"]
|
||||
strip: //h1
|
||||
strip: //h2
|
||||
strip_id_or_class: author
|
||||
prune: no
|
||||
test_url: http://www.alriyadh.com/2011/10/10/article674357.html
|
||||
body: //div[@id = "article-view"]
|
||||
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
|
||||
author: //p[@class = "author"]
|
||||
strip: //h1
|
||||
strip: //h2
|
||||
strip_id_or_class: author
|
||||
prune: no
|
||||
test_url: http://www.alriyadh.com/2011/10/10/article674357.html
|
||||
test_url: http://www.alriyadh.com/net/article/780935
|
0
inc/3rdparty/site_config/standard/alseraj.net.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/alseraj.net.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/alt1040.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/alt1040.com.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/alternet.org.txt
vendored
Executable file
4
inc/3rdparty/site_config/standard/alternet.org.txt
vendored
Executable file
@ -0,0 +1,4 @@
|
||||
single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')]
|
||||
|
||||
test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest
|
||||
test_url: http://feeds.feedblitz.com/alternet
|
0
inc/3rdparty/site_config/standard/altfoto.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/altfoto.com.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/alumni.stanford.edu.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/alumni.stanford.edu.txt
vendored
Normal file → Executable file
@ -1,10 +1,10 @@
|
||||
title: //h1
|
||||
|
||||
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
|
||||
|
||||
date: //div/a[contains (@href, "issue")]
|
||||
|
||||
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
|
||||
|
||||
title: //h1
|
||||
|
||||
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
|
||||
|
||||
date: //div/a[contains (@href, "issue")]
|
||||
|
||||
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
|
||||
|
||||
body: //div[@class="enableBullets"]
|
||||
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819
|
6
inc/3rdparty/site_config/standard/amandala.com.bz.txt
vendored
Executable file
6
inc/3rdparty/site_config/standard/amandala.com.bz.txt
vendored
Executable file
@ -0,0 +1,6 @@
|
||||
body: //div[@id='content']//div[contains(@class, 'content')]
|
||||
strip_id_or_class: widget
|
||||
strip: //a[contains(@href, 'upm_export=')]
|
||||
|
||||
test_url: http://amandala.com.bz/news/feed/
|
||||
test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/
|
36
inc/3rdparty/site_config/standard/amazon.com.txt
vendored
Normal file → Executable file
36
inc/3rdparty/site_config/standard/amazon.com.txt
vendored
Normal file → Executable file
@ -1,19 +1,19 @@
|
||||
title: //span[@id = 'btAsinTitle']
|
||||
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
|
||||
#strip_id_or_class: quantityDropdownDiv
|
||||
#strip_id_or_class: addToCartSpan
|
||||
#strip_id_or_class: oneClickDiv
|
||||
strip_id_or_class: nocontent
|
||||
strip_id_or_class: masDynamicConten
|
||||
strip_id_or_class: dynamic-content
|
||||
prune: no
|
||||
|
||||
find_string: <span id="actualPriceValue">
|
||||
replace_string: <span id="actualPriceValue"><br />Price:
|
||||
|
||||
strip_id_or_class: collapsePS
|
||||
strip_id_or_class: expandPS
|
||||
strip_id_or_class: psPlaceHolde
|
||||
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
|
||||
|
||||
title: //span[@id = 'btAsinTitle']
|
||||
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
|
||||
#strip_id_or_class: quantityDropdownDiv
|
||||
#strip_id_or_class: addToCartSpan
|
||||
#strip_id_or_class: oneClickDiv
|
||||
strip_id_or_class: nocontent
|
||||
strip_id_or_class: masDynamicConten
|
||||
strip_id_or_class: dynamic-content
|
||||
prune: no
|
||||
|
||||
find_string: <span id="actualPriceValue">
|
||||
replace_string: <span id="actualPriceValue"><br />Price:
|
||||
|
||||
strip_id_or_class: collapsePS
|
||||
strip_id_or_class: expandPS
|
||||
strip_id_or_class: psPlaceHolde
|
||||
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
|
||||
|
||||
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/
|
8
inc/3rdparty/site_config/standard/americandrink.net.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/americandrink.net.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //div[@class='head']/h2/a
|
||||
author: //div[@class='head']/a
|
||||
date: //div[@class='head']/p[@class='date']/a
|
||||
body: //div[@class='copy']
|
||||
title: //div[@class='head']/h2/a
|
||||
author: //div[@class='head']/a
|
||||
date: //div[@class='head']/p[@class='date']/a
|
||||
body: //div[@class='copy']
|
||||
strip: //p[@class='meta']
|
||||
test_url: http://americandrink.net/post/10567188712/free-the-hooch
|
18
inc/3rdparty/site_config/standard/americascup.com.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/americascup.com.txt
vendored
Normal file → Executable file
@ -1,10 +1,10 @@
|
||||
title: //div[@class="editorial-content"]/h3
|
||||
body: //div[@class="hero-image" or @class="editorial-content"]
|
||||
|
||||
strip: //ul[@class="hero-caption"]
|
||||
strip_id_or_class: footer
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
title: //div[@class="editorial-content"]/h3
|
||||
body: //div[@class="hero-image" or @class="editorial-content"]
|
||||
|
||||
strip: //ul[@class="hero-caption"]
|
||||
strip_id_or_class: footer
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/
|
6
inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
title: //h1[@class="post-title"]
|
||||
author: //span[@class="author"]/a
|
||||
date: //span[@class="date"]
|
||||
title: //h1[@class="post-title"]
|
||||
author: //span[@class="author"]/a
|
||||
date: //span[@class="date"]
|
||||
body: //div[@class="post-content main"]
|
||||
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/
|
8
inc/3rdparty/site_config/standard/amptoons.com.txt
vendored
Executable file
8
inc/3rdparty/site_config/standard/amptoons.com.txt
vendored
Executable file
@ -0,0 +1,8 @@
|
||||
title: //title
|
||||
|
||||
body: //div[@class="entry-content"]
|
||||
|
||||
author: //span[@class="author vcard"]
|
||||
|
||||
date: //span[@class="entry-date"]
|
||||
test_url: http://www.amptoons.com/blog/2013/03/14/open-thread-and-link-farm-i-hate-being-sick-edition/
|
20
inc/3rdparty/site_config/standard/anandtech.com.txt
vendored
Normal file → Executable file
20
inc/3rdparty/site_config/standard/anandtech.com.txt
vendored
Normal file → Executable file
@ -1,11 +1,11 @@
|
||||
author: //a[@class='b'][1]
|
||||
date: substring-after(substring-before(//div, 'Posted in'), ' on ')
|
||||
strip_image_src: /content/images/globals/
|
||||
strip: //h2[. = 'Page 1']/preceding::p
|
||||
strip: //h2
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
|
||||
|
||||
author: //a[@class='b'][1]
|
||||
date: substring-after(substring-before(//div, 'Posted in'), ' on ')
|
||||
strip_image_src: /content/images/globals/
|
||||
strip: //h2[. = 'Page 1']/preceding::p
|
||||
strip: //h2
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
|
||||
|
||||
test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/
|
5
inc/3rdparty/site_config/standard/androidpolice.com.txt
vendored
Executable file
5
inc/3rdparty/site_config/standard/androidpolice.com.txt
vendored
Executable file
@ -0,0 +1,5 @@
|
||||
body: //div[@class='post_content']
|
||||
date: //div[@class='date_day'] | //div[@class='date_month']
|
||||
|
||||
test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/
|
||||
|
16
inc/3rdparty/site_config/standard/andyrutledge.com.txt
vendored
Normal file → Executable file
16
inc/3rdparty/site_config/standard/andyrutledge.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
title: //h2
|
||||
author: string('Andy Rutledge')
|
||||
date: //div[@class='articledate']
|
||||
body: //div[@class='copybody']
|
||||
|
||||
strip: //*[@class='space']
|
||||
strip: //*[@class='articleFoot']
|
||||
|
||||
title: //h2
|
||||
author: string('Andy Rutledge')
|
||||
date: //div[@class='articledate']
|
||||
body: //div[@class='copybody']
|
||||
|
||||
strip: //*[@class='space']
|
||||
strip: //*[@class='articleFoot']
|
||||
|
||||
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php
|
14
inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
title: //h1[@class="title"]
|
||||
|
||||
author: ("Anna Manasova")
|
||||
# is ignored, unfortunately
|
||||
|
||||
date: //p[@class="date"]
|
||||
|
||||
title: //h1[@class="title"]
|
||||
|
||||
author: ("Anna Manasova")
|
||||
# is ignored, unfortunately
|
||||
|
||||
date: //p[@class="date"]
|
||||
|
||||
body: //div[@class="entry"]
|
||||
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/
|
34
inc/3rdparty/site_config/standard/applature.com.txt
vendored
Normal file → Executable file
34
inc/3rdparty/site_config/standard/applature.com.txt
vendored
Normal file → Executable file
@ -1,18 +1,18 @@
|
||||
title: //h1[contains(@class, 'title')#
|
||||
body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
|
||||
date: //div[@class='date']
|
||||
|
||||
strip_id_or_class: sharethis
|
||||
strip_id_or_class: stats
|
||||
strip_id_or_class: apply_form
|
||||
strip_id_or_class: job_map
|
||||
strip_id_or_class: respond
|
||||
strip: //h1//span[@class='type']
|
||||
strip: //li[@class='print' or @class='map']
|
||||
|
||||
replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
title: //h1[contains(@class, 'title')]
|
||||
body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
|
||||
date: //div[@class='date']
|
||||
|
||||
strip_id_or_class: sharethis
|
||||
strip_id_or_class: stats
|
||||
strip_id_or_class: apply_form
|
||||
strip_id_or_class: job_map
|
||||
strip_id_or_class: respond
|
||||
strip: //h1//span[@class='type']
|
||||
strip: //li[@class='print' or @class='map']
|
||||
|
||||
replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
|
||||
|
||||
prune: no
|
||||
tidy: no
|
||||
|
||||
test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/
|
12
inc/3rdparty/site_config/standard/apple.com.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/apple.com.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
strip: //p[@class='sosumi']
|
||||
# Aren't they witty?
|
||||
|
||||
# I can't work out what causes the before the title.
|
||||
title: //h1[@class='title']
|
||||
strip: //h1[@class='title']
|
||||
strip: //p[@class='sosumi']
|
||||
# Aren't they witty?
|
||||
|
||||
# I can't work out what causes the &nbsp; before the title.
|
||||
title: //h1[@class='title']
|
||||
strip: //h1[@class='title']
|
||||
test_url: http://www.apple.com/pr/library/2011/02/15appstore.html
|
4
inc/3rdparty/site_config/standard/appledaily.com.tw.txt
vendored
Executable file
4
inc/3rdparty/site_config/standard/appledaily.com.tw.txt
vendored
Executable file
@ -0,0 +1,4 @@
|
||||
body: //div[contains(@class, 'articulum')]
|
||||
|
||||
test_url: http://www.appledaily.com.tw/realtimenews/article/new/20140120/330479
|
||||
test_url: http://www.appledaily.com.tw/rss/create/kind/rnews/type/new/
|
34
inc/3rdparty/site_config/standard/appleinsider.com.txt
vendored
Normal file → Executable file
34
inc/3rdparty/site_config/standard/appleinsider.com.txt
vendored
Normal file → Executable file
@ -1,11 +1,23 @@
|
||||
title: //p[@class='title']
|
||||
|
||||
author: //p[text() = 'By ']/a/text()
|
||||
strip: //p[text() = 'By ']
|
||||
|
||||
body: //td[@class='bod']
|
||||
strip_id_or_class: title
|
||||
strip_id_or_class: minor
|
||||
|
||||
strip_id_or_class: multipagefooter
|
||||
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
|
||||
title: //h1[@class="art-head"]
|
||||
|
||||
author: //p[contains(@class, 'byline')]/a
|
||||
#author: //p[text() = 'By ']/a/text()
|
||||
#strip: //p[text() = 'By ']
|
||||
|
||||
date: //p[contains(@class, 'date-header')]
|
||||
|
||||
body: //div[@class="article"]
|
||||
strip_id_or_class: lazy
|
||||
#strip_id_or_class: minor
|
||||
strip_id_or_class: multipagefooter
|
||||
strip_id_or_class: date-header
|
||||
strip_id_or_class: byline
|
||||
|
||||
find_string: <noscript>
|
||||
replace_string: <div>
|
||||
find_string: </noscript>
|
||||
replace_string: </div>
|
||||
|
||||
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
|
||||
test_url: http://appleinsider.com/articles/13/10/03/goldee-companion-app-for-philips-hue-bulbs-offers-shifting-dynamic-light-scenes
|
||||
test_url: http://appleinsider.com/appleinsider.rss
|
0
inc/3rdparty/site_config/standard/appleweblog.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/appleweblog.com.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/archdaily.com.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/archdaily.com.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
date: //div[@class='post_date']
|
||||
|
||||
body: //div[@class='post_content']
|
||||
date: //div[@class='post_date']
|
||||
|
||||
body: //div[@class='post_content']
|
||||
|
||||
test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up
|
38
inc/3rdparty/site_config/standard/archiveofourown.org.txt
vendored
Normal file → Executable file
38
inc/3rdparty/site_config/standard/archiveofourown.org.txt
vendored
Normal file → Executable file
@ -1,18 +1,22 @@
|
||||
# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
|
||||
# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.
|
||||
# Exclude: header, footer, navigation, comments.
|
||||
# Notes: User is a newbie with XPaths.
|
||||
|
||||
title: //h2[@class='title']
|
||||
author: //h3[@class='byline']
|
||||
author: //a[@class='login author']
|
||||
|
||||
strip_id_or_class:header
|
||||
strip_id_or_class:navigation
|
||||
strip_id_or_class:feedback
|
||||
strip_id_or_class:kudos
|
||||
strip_id_or_class:add_comment_placeholder
|
||||
strip_id_or_class:add_comment
|
||||
strip_id_or_class:globalize
|
||||
# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
|
||||
# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.
|
||||
# Exclude: header, footer, navigation, comments.
|
||||
# Notes: User is a newbie with XPaths.
|
||||
|
||||
title: //h2[@class='title']
|
||||
author: //h3[@class='byline']
|
||||
author: //a[@class='login author']
|
||||
|
||||
strip_id_or_class:header
|
||||
strip_id_or_class:navigation
|
||||
strip_id_or_class:feedback
|
||||
strip_id_or_class:kudos
|
||||
strip_id_or_class:add_comment_placeholder
|
||||
strip_id_or_class:add_comment
|
||||
strip_id_or_class:globalize
|
||||
strip_id_or_class:footer
|
||||
test_url: http://archiveofourown.org/works/229402?view_full_work=true
|
||||
|
||||
single_page_link: //div[@id='main']//a[contains(@href, 'view_adult=true')]
|
||||
|
||||
test_url: http://archiveofourown.org/works/229402?view_full_work=true
|
||||
test_url: http://archiveofourown.org/works/750111/chapters/1399929
|
33
inc/3rdparty/site_config/standard/arstechnica.com.txt
vendored
Normal file → Executable file
33
inc/3rdparty/site_config/standard/arstechnica.com.txt
vendored
Normal file → Executable file
@ -1,16 +1,17 @@
|
||||
author: //p[@class='byline']/a
|
||||
body: //div[contains(@class,'article-content')]
|
||||
strip: //h2[@class='title']
|
||||
strip_id_or_class: byline
|
||||
prune: no
|
||||
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr
|
||||
|
||||
title: //div[@id='story']//h2[@class='title']
|
||||
|
||||
strip: //div[@class='pager']
|
||||
next_page_link: //nav//a[span/@class='next']/@href
|
||||
|
||||
test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
|
||||
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
|
||||
author: //p[@class='byline']/a
|
||||
body: //div[contains(@class,'article-content')]
|
||||
strip: //h2[@class='title']
|
||||
strip_id_or_class: byline
|
||||
strip_id_or_class: story-sidebar
|
||||
prune: no
|
||||
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
|
||||
date: //div[@class='byline']/span[@class='posted']//abbr
|
||||
|
||||
title: //div[@id='story']//h2[@class='title']
|
||||
|
||||
strip: //div[@class='pager']
|
||||
next_page_link: //nav//a[span/@class='next']/@href
|
||||
|
||||
test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
|
||||
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
|
||||
|
8
inc/3rdparty/site_config/standard/articles.boston.com.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/articles.boston.com.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
|
||||
author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
|
||||
date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
|
||||
title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
|
||||
author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
|
||||
date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
|
||||
strip_id_or_class: mod-pagination
|
||||
test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park
|
18
inc/3rdparty/site_config/standard/articles.courant.com.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/articles.courant.com.txt
vendored
Normal file → Executable file
@ -1,11 +1,11 @@
|
||||
title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
|
||||
date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
|
||||
|
||||
strip_id_or_class: mod-article-byline
|
||||
strip_id_or_class: mod-article-header
|
||||
strip_id_or_class: mod-article-subtitle
|
||||
#This leaves some crud after the article, but it's better than nothing.
|
||||
#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
|
||||
title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
|
||||
date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
|
||||
author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
|
||||
|
||||
strip_id_or_class: mod-article-byline
|
||||
strip_id_or_class: mod-article-header
|
||||
strip_id_or_class: mod-article-subtitle
|
||||
#This leaves some crud after the article, but it's better than nothing.
|
||||
#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
|
||||
|
||||
test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown
|
11
inc/3rdparty/site_config/standard/articles.washingtonpost.com.txt
vendored
Executable file
11
inc/3rdparty/site_config/standard/articles.washingtonpost.com.txt
vendored
Executable file
@ -0,0 +1,11 @@
|
||||
body: //div[contains(@class, "article_body")]
|
||||
# print view
|
||||
body: //div[@id='print_facet']//div[@id='body']
|
||||
|
||||
tidy: no
|
||||
prune: no
|
||||
|
||||
single_page_link: concat(substring-before(//div[@id="echo_container_a"]/@guid, '_story.html'), '_print.html')
|
||||
|
||||
test_url: http://articles.washingtonpost.com/2011-10-22/world/35279694_1_germany-acts-german-leaders-chancellor-angela-merkel
|
||||
test_url: http://articles.washingtonpost.com/2013-05-31/opinions/39658000_1_chemical-weapons-mass-destruction-cartels
|
2
inc/3rdparty/site_config/standard/asahi.com.txt
vendored
Normal file → Executable file
2
inc/3rdparty/site_config/standard/asahi.com.txt
vendored
Normal file → Executable file
@ -1,3 +1,3 @@
|
||||
body: //div[@id='HeadLine']
|
||||
body: //div[@id='HeadLine']
|
||||
strip: //div[@id='utility_right']
|
||||
test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html
|
6
inc/3rdparty/site_config/standard/ascarter.net.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/ascarter.net.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
title: //h1[@class='article_title']
|
||||
author: //span[@class='author']
|
||||
date: //h2[@class='dateline']
|
||||
title: //h1[@class='article_title']
|
||||
author: //span[@class='author']
|
||||
date: //h2[@class='dateline']
|
||||
body: //div[@class='article_body']
|
||||
test_url: http://ascarter.net/2012/02/20/enough-is-enough.html
|
10
inc/3rdparty/site_config/standard/astronews.com.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/astronews.com.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
title: //span[@class='titel']
|
||||
author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
|
||||
date: substring-after(//span[@class='metadaten_C'],'astronews.com')
|
||||
strip: //span[@class='bu']
|
||||
strip_image_src: '/_images/'
|
||||
title: //span[@class='titel']
|
||||
author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
|
||||
date: substring-after(//span[@class='metadaten_C'],'astronews.com')
|
||||
strip: //span[@class='bu']
|
||||
strip_image_src: /_images/
|
||||
|
||||
test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml
|
12
inc/3rdparty/site_config/standard/asymco.com.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/asymco.com.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
# Johannes Stühler
|
||||
|
||||
title://h2
|
||||
author://span[@class='meta-content']
|
||||
date://abbr[@class='date published']/@title
|
||||
body://div[@class='entry-content']
|
||||
# Johannes Stühler
|
||||
|
||||
title://h2
|
||||
author://span[@class='meta-content']
|
||||
date://abbr[@class='date published']/@title
|
||||
body://div[@class='entry-content']
|
||||
|
||||
test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/
|
8
inc/3rdparty/site_config/standard/autoblog.com.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/autoblog.com.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
prune: no
|
||||
body: //div[@class='post-body']
|
||||
author: //p[@class='byline']//a
|
||||
date: substring-after(//div[@class='about']/p[2], 'Posted')
|
||||
prune: no
|
||||
body: //div[@class='post-body']
|
||||
author: //p[@class='byline']//a
|
||||
date: substring-after(//div[@class='about']/p[2], 'Posted')
|
||||
strip: //div[@class='body']/div[@class='meta']
|
||||
test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/
|
4
inc/3rdparty/site_config/standard/avclub.com.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/avclub.com.txt
vendored
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
author: //*[@id="article_wrapper"]/div[1]/a[1]
|
||||
body: //*[@id="article_wrapper"]/div[2]
|
||||
author: //*[@id="article_wrapper"]/div[1]/a[1]
|
||||
body: //*[@id="article_wrapper"]/div[2]
|
||||
date: //*[@id="article_wrapper"]/div[1]/text()[2]
|
||||
test_url: http://www.avclub.com/articles/forgetmenot,70904
|
20
inc/3rdparty/site_config/standard/baltimoresun.com.txt
vendored
Normal file → Executable file
20
inc/3rdparty/site_config/standard/baltimoresun.com.txt
vendored
Normal file → Executable file
@ -1,12 +1,12 @@
|
||||
single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
|
||||
convert_double_br_tags: yes
|
||||
|
||||
title: //div[@class="story"]/h1
|
||||
body: //div[@id="story-body-text"]
|
||||
author: //span[@class="byline"]
|
||||
date: //p[@class="date"]
|
||||
|
||||
strip: //*[@class='all']
|
||||
strip: //*[@class='articlerail']
|
||||
single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
|
||||
convert_double_br_tags: yes
|
||||
|
||||
title: //div[@class="story"]/h1
|
||||
body: //div[@id="story-body-text"]
|
||||
author: //span[@class="byline"]
|
||||
date: //p[@class="date"]
|
||||
|
||||
strip: //*[@class='all']
|
||||
strip: //*[@class='articlerail']
|
||||
|
||||
test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story
|
13
inc/3rdparty/site_config/standard/baseballprospectus.com.txt
vendored
Executable file
13
inc/3rdparty/site_config/standard/baseballprospectus.com.txt
vendored
Executable file
@ -0,0 +1,13 @@
|
||||
title: //h1[@class='title']
|
||||
author: //p[@class="author"]/a[1]
|
||||
body: //div[@class="article"]
|
||||
date: //p[@class="date"]
|
||||
|
||||
# remove user tools
|
||||
strip: //div[@class='tools']
|
||||
strip: //h1
|
||||
strip: //h2[@class='subtitle']
|
||||
strip: //p[@class='author']
|
||||
strip: //p[@class='date']
|
||||
|
||||
test_url: http://www.baseballprospectus.com/article.php?articleid=18463
|
10
inc/3rdparty/site_config/standard/basicthinking.de.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/basicthinking.de.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
title: //h2
|
||||
date: //span[@class='date']
|
||||
body: //div[@class='entry']
|
||||
|
||||
strip: //div[@class='zusatz']
|
||||
title: //h2
|
||||
date: //span[@class='date']
|
||||
body: //div[@class='entry']
|
||||
|
||||
strip: //div[@class='zusatz']
|
||||
|
||||
test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/
|
22
inc/3rdparty/site_config/standard/bb.is.txt
vendored
Normal file → Executable file
22
inc/3rdparty/site_config/standard/bb.is.txt
vendored
Normal file → Executable file
@ -1,13 +1,13 @@
|
||||
author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
|
||||
|
||||
|
||||
date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
|
||||
|
||||
|
||||
body: //div[@class='first-article-big']
|
||||
strip: //table[@class='newsimagecontainer']
|
||||
strip: //h3[@class='headlines']
|
||||
strip: //iframe[@class='headlines']
|
||||
strip: //a[@class='newslink']
|
||||
author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
|
||||
|
||||
|
||||
date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
|
||||
|
||||
|
||||
body: //div[@class='first-article-big']
|
||||
strip: //table[@class='newsimagecontainer']
|
||||
strip: //h3[@class='headlines']
|
||||
strip: //iframe[@class='headlines']
|
||||
strip: //a[@class='newslink']
|
||||
convert_double_br_tags: yes
|
||||
test_url: http://bb.is/Pages/82?NewsID=174119
|
74
inc/3rdparty/site_config/standard/bbc.co.uk.txt
vendored
Normal file → Executable file
74
inc/3rdparty/site_config/standard/bbc.co.uk.txt
vendored
Normal file → Executable file
@ -1,32 +1,42 @@
|
||||
body: //div[@class="story-body"]
|
||||
title: //h1[@class="story-header"]
|
||||
date: //span[@class="story-date"]/span[@class='date']
|
||||
|
||||
# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
|
||||
body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
|
||||
|
||||
#strip: //div[@class="story-feature narrow"]
|
||||
#strip: //div[@class="story-feature wide"]
|
||||
#strip: //div[@class="story-feature dslideshow-enclosure"]
|
||||
strip: //div[contains(@class, "story-feature")]
|
||||
strip: //span[@class="story-date"]
|
||||
#strip: //div[@class="caption body-narrow-width"]
|
||||
strip: //div[@class="warning"]//p
|
||||
strip: //div[@id='page-bookmark-links-head']
|
||||
strip: //object
|
||||
strip: //div[contains(@class, "bbccom_advert_placeholder")]
|
||||
strip: //div[contains(@class, "embedded-hyper")]
|
||||
strip: //div[contains(@class, 'market-data')]
|
||||
strip: //a[contains(@class, 'hidden')]
|
||||
strip: //div[contains(@class, 'hypertabs')]
|
||||
strip: //div[contains(@class, 'related')]
|
||||
strip: //form[@id='comment-form']
|
||||
strip: //div[contains(@class, 'comment-introduction')]
|
||||
|
||||
replace_string(<noscript>): <div>
|
||||
replace_string(</noscript>): </div>
|
||||
|
||||
prune: no
|
||||
|
||||
dissolve: //h2
|
||||
test_url: http://www.bbc.co.uk/news/business-15060862
|
||||
body: //div[@class="story-body"]
|
||||
# for video entries
|
||||
body: //div[contains(@class, "videoInStory") or @id="meta-information"]
|
||||
title: //h1[@class="story-header"]
|
||||
date: //span[@class="story-date"]/span[@class='date']
|
||||
# for sport site
|
||||
date: //meta[@name='DCTERMS.created']/@content
|
||||
author: //div[@id='headline']//span[@class='byline-name']
|
||||
|
||||
# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
|
||||
body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
|
||||
|
||||
#strip: //div[@class="story-feature narrow"]
|
||||
#strip: //div[@class="story-feature wide"]
|
||||
#strip: //div[@class="story-feature dslideshow-enclosure"]
|
||||
strip: //div[contains(@class, "story-feature")]
|
||||
strip: //span[@class="story-date"]
|
||||
#strip: //div[@class="caption body-narrow-width"]
|
||||
strip: //div[@class="warning"]//p
|
||||
strip: //div[@id='page-bookmark-links-head']
|
||||
strip: //object
|
||||
strip: //div[contains(@class, "bbccom_advert_placeholder")]
|
||||
strip: //div[contains(@class, "embedded-hyper")]
|
||||
strip: //div[contains(@class, 'market-data')]
|
||||
strip: //a[contains(@class, 'hidden')]
|
||||
strip: //div[contains(@class, 'hypertabs')]
|
||||
strip: //div[contains(@class, 'related')]
|
||||
strip: //form[@id='comment-form']
|
||||
strip: //div[contains(@class, 'comment-introduction')]
|
||||
strip: //div[contains(@class, 'share-tools')]
|
||||
strip: //div[@id='also-related-links']
|
||||
|
||||
replace_string(<noscript>): <div>
|
||||
replace_string(</noscript>): </div>
|
||||
|
||||
prune: no
|
||||
|
||||
dissolve: //h2
|
||||
test_url: http://www.bbc.co.uk/sport/0/football/23224017
|
||||
test_url: http://www.bbc.co.uk/news/business-15060862
|
||||
# video entry
|
||||
test_url: http://www.bbc.co.uk/news/world-asia-22056933
|
16
inc/3rdparty/site_config/standard/bbcgoodfood.com.txt
vendored
Executable file
16
inc/3rdparty/site_config/standard/bbcgoodfood.com.txt
vendored
Executable file
@ -0,0 +1,16 @@
|
||||
title: //header//h1
|
||||
#body: //article[contains(@class, 'node-full')]
|
||||
body: //div[contains(@class, 'recipe-details') or contains(@class, 'tips-carousel')] | //section[@id='recipe-ingredients' or @id='recipe-method']
|
||||
|
||||
strip_id_or_class: recipe-rating-wrapper
|
||||
strip_id_or_class: magazine-subcribe-header
|
||||
strip_id_or_class: hide
|
||||
strip_id_or_class: recipe-actions
|
||||
strip_id_or_class: buy-ingredients
|
||||
strip_id_or_class: related-content
|
||||
strip_id_or_class: recipe-magazine-ad
|
||||
strip_id_or_class: copy-right
|
||||
|
||||
prune: no
|
||||
|
||||
test_url: http://www.bbcgoodfood.com/recipes/1131634/minced-beef-wellington
|
28
inc/3rdparty/site_config/standard/benoitmaison.org.txt
vendored
Normal file → Executable file
28
inc/3rdparty/site_config/standard/benoitmaison.org.txt
vendored
Normal file → Executable file
@ -1,16 +1,16 @@
|
||||
body: //div[@class="entry-content"]
|
||||
|
||||
# Remove text ‘Tweet’
|
||||
strip: //div[@class="entry-content"]/div[last()]
|
||||
|
||||
title: h1[@class="entry-title"]
|
||||
|
||||
# If the Instapaper text parser worked with HTML5 tags, we would use:
|
||||
date: //time[@class="entry-date"]
|
||||
|
||||
# But since it does not, use this more complicated rule:
|
||||
date: //div[@class="entry-meta"]/a[@rel="bookmark"]
|
||||
|
||||
# Unfortunately, the following rule is overridden by the automatically found author.
|
||||
body: //div[@class="entry-content"]
|
||||
|
||||
# Remove text ‘Tweet’
|
||||
strip: //div[@class="entry-content"]/div[last()]
|
||||
|
||||
title: //h1[@class="entry-title"]
|
||||
|
||||
# If the Instapaper text parser worked with HTML5 tags, we would use:
|
||||
date: //time[@class="entry-date"]
|
||||
|
||||
# But since it does not, use this more complicated rule:
|
||||
date: //div[@class="entry-meta"]/a[@rel="bookmark"]
|
||||
|
||||
# Unfortunately, the following rule is overridden by the automatically found author.
|
||||
author: ("Benoit Maison")
|
||||
test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/
|
2
inc/3rdparty/site_config/standard/berlingske.dk.txt
vendored
Normal file → Executable file
2
inc/3rdparty/site_config/standard/berlingske.dk.txt
vendored
Normal file → Executable file
@ -1,3 +1,3 @@
|
||||
title: //h1[@class='headline']
|
||||
title: //h1[@class='headline']
|
||||
body: //div[contains(@class, 'article-wrapper')]
|
||||
test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa
|
5
inc/3rdparty/site_config/standard/bernama.com.txt
vendored
Executable file
5
inc/3rdparty/site_config/standard/bernama.com.txt
vendored
Executable file
@ -0,0 +1,5 @@
|
||||
body: //div[contains(@class, "NewsText")]
|
||||
prune: no
|
||||
|
||||
test_url: http://www.bernama.com/bernama/v7/rss/english.php
|
||||
test_url: http://www.bernama.com/bernama/v7/newsindex.php?id=943513
|
0
inc/3rdparty/site_config/standard/betabeat.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/betabeat.com.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/betanews.com.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/betanews.com.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
# some articles at this site like this one doesn't
|
||||
# seem to pick up the article body via normal
|
||||
# processing, other articles come through fine
|
||||
# http://www.betanews.com/joewilcox/article
|
||||
# /Google-is-a-marketing-sensation/1309708375
|
||||
# some articles at this site, like this one, don't
|
||||
# seem to pick up the article body via normal
|
||||
# processing, other articles come through fine
|
||||
# http://www.betanews.com/joewilcox/article
|
||||
# /Google-is-a-marketing-sensation/1309708375
|
||||
body: //*[@id="article"]
|
||||
test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375
|
12
inc/3rdparty/site_config/standard/biography.com.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/biography.com.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
title: //div[contains(@class, 'main-content')]//h1
|
||||
body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
|
||||
title: //div[contains(@class, 'main-content')]//h1
|
||||
body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
|
||||
|
||||
prune: no
|
||||
|
||||
single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
|
||||
|
||||
test_url: http://www.biography.com/print/profile/martin-luther-9389283
|
0
inc/3rdparty/site_config/standard/bitelia.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/bitelia.com.txt
vendored
Normal file → Executable file
13
inc/3rdparty/site_config/standard/bizjournals.com.txt
vendored
Executable file
13
inc/3rdparty/site_config/standard/bizjournals.com.txt
vendored
Executable file
@ -0,0 +1,13 @@
|
||||
date: //meta[@name='publish-date']/@content
|
||||
body: //div[contains(@class, 'articleContentWrapper')]
|
||||
prune: no
|
||||
|
||||
strip: //div[contains(@class, 'staff_info')]//dd[contains(., 'Twitter')]
|
||||
|
||||
strip_id_or_class: related_content
|
||||
strip_id_or_class: enlarge
|
||||
strip_id_or_class: photoBy
|
||||
strip_id_or_class: older
|
||||
|
||||
test_url: http://www.bizjournals.com/cincinnati/news/2013/10/03/harris-teeter-shareholders-vote-on.html
|
||||
test_url: http://feeds.bizjournals.com/industry_20?format=xml
|
10
inc/3rdparty/site_config/standard/bjango.com.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/bjango.com.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
title: //h1[@class='articlehead']
|
||||
body: //div[@class='column']
|
||||
strip: //h1
|
||||
strip: //div[@class='help']
|
||||
|
||||
title: //h1[@class='articlehead']
|
||||
body: //div[@class='column']
|
||||
strip: //h1
|
||||
strip: //div[@class='help']
|
||||
|
||||
#no author or date/time provided in current layout
|
||||
test_url: http://bjango.com/articles/actions/
|
12
inc/3rdparty/site_config/standard/blog.arsln.org.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/blog.arsln.org.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
tidy: no
|
||||
prune: no
|
||||
date: //article/header/h6/time
|
||||
title: //article/header/h3
|
||||
author: //meta[@name='author']/@content
|
||||
body: //article//post
|
||||
tidy: no
|
||||
prune: no
|
||||
date: //article/header/h6/time
|
||||
title: //article/header/h3
|
||||
author: //meta[@name='author']/@content
|
||||
body: //article//post
|
||||
|
||||
test_url: http://blog.arsln.org/aska-ayip-oluyor/
|
10
inc/3rdparty/site_config/standard/blog.asmartbear.com.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/blog.asmartbear.com.txt
vendored
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
title: //title
|
||||
author: //span[@class='author vcard']/a
|
||||
date: //p[@class='headline_meta']/abbr[@class='published']
|
||||
body: //div[@class='format_text entry-content']
|
||||
|
||||
title: //title
|
||||
author: //span[@class='author vcard']/a
|
||||
date: //p[@class='headline_meta']/abbr[@class='published']
|
||||
body: //div[@class='format_text entry-content']
|
||||
|
||||
strip: //div[@id='dd_ajax_float']
|
||||
test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html
|
14
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt
vendored
Normal file → Executable file
14
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt
vendored
Normal file → Executable file
@ -1,9 +1,9 @@
|
||||
# Instapaper gets this back to front and only gets the blog title instead of the article title.
|
||||
title: substring-before(//title, '-')
|
||||
|
||||
author: //a[ contains(@href, '/people') ]
|
||||
|
||||
body: //div[ @class='post' ]
|
||||
|
||||
# Instapaper gets this back to front and only gets the blog title instead of the article title.
|
||||
title: substring-before(//title, '-')
|
||||
|
||||
author: //a[ contains(@href, '/people') ]
|
||||
|
||||
body: //div[ @class='post' ]
|
||||
|
||||
# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
|
||||
test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
|
6
inc/3rdparty/site_config/standard/blog.fefe.de.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/blog.fefe.de.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
title: //h2
|
||||
date: //h3
|
||||
body: //ul
|
||||
title: //h2
|
||||
date: //h3
|
||||
body: //ul
|
||||
|
||||
test_url: http://blog.fefe.de/?ts=b063bf55
|
18
inc/3rdparty/site_config/standard/blog.instagram.com.txt
vendored
Normal file → Executable file
18
inc/3rdparty/site_config/standard/blog.instagram.com.txt
vendored
Normal file → Executable file
@ -1,11 +1,11 @@
|
||||
# clean Instagram blog a little bit
|
||||
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
body://div[contains(@id,'content')]
|
||||
|
||||
strip_id_or_class:meta
|
||||
strip_id_or_class:notes
|
||||
# clean Instagram blog a little bit
|
||||
|
||||
tidy:no
|
||||
prune:no
|
||||
|
||||
body://div[contains(@id,'content')]
|
||||
|
||||
strip_id_or_class:meta
|
||||
strip_id_or_class:notes
|
||||
strip_id_or_class:pagination
|
||||
test_url: http://blog.instagram.com/post/8757832007/fromwhereistand
|
9
inc/3rdparty/site_config/standard/blog.instapaper.com.txt
vendored
Executable file
9
inc/3rdparty/site_config/standard/blog.instapaper.com.txt
vendored
Executable file
@ -0,0 +1,9 @@
|
||||
author: //a[@href="http://www.marco.org/about"]
|
||||
date: //span[@class="date"]
|
||||
|
||||
# Remove the date from article body.
|
||||
strip: //span[@class="date"]
|
||||
|
||||
# Remove pagination links from article body.
|
||||
strip: //div[@id="pagination"]
|
||||
test_url: http://blog.instapaper.com/post/31303984531
|
4
inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt
vendored
Normal file → Executable file
4
inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt
vendored
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
date: //span[contains(@class, 'date-links')]
|
||||
author: //span[contains(@class, 'author-links')]
|
||||
date: //span[contains(@class, 'date-links')]
|
||||
author: //span[contains(@class, 'author-links')]
|
||||
body: //div[contains(@class, 'entry-content')]
|
||||
test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web
|
6
inc/3rdparty/site_config/standard/blog.kaelig.fr.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/blog.kaelig.fr.txt
vendored
Normal file → Executable file
@ -1,5 +1,5 @@
|
||||
body: //*[contains(@class, 'post_content')]
|
||||
author: string('Kaelig Deloumeau-Prigent')
|
||||
title: //h1[@class='title']
|
||||
body: //*[contains(@class, 'post_content')]
|
||||
author: string('Kaelig Deloumeau-Prigent')
|
||||
title: //h1[@class='title']
|
||||
date: //span[@class='date']
|
||||
test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par
|
8
inc/3rdparty/site_config/standard/blog.naver.com.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/blog.naver.com.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //span[@class='pcol1 itemSubjectBoldfont']
|
||||
body: //div[@id='postListBody']
|
||||
date: //p[@class='date fil5 pcol2']
|
||||
single_page_link: /html/frameset/frame[1]/attribute::src
|
||||
title: //span[@class='pcol1 itemSubjectBoldfont']
|
||||
body: //div[@id='postListBody']
|
||||
date: //p[@class='date fil5 pcol2']
|
||||
single_page_link: /html/frameset/frame[1]/attribute::src
|
||||
strip: //div[@class='post-btn']
|
||||
test_url: http://blog.naver.com/how2invest/110135068757
|
20
inc/3rdparty/site_config/standard/blog.pchome.net.txt
vendored
Normal file → Executable file
20
inc/3rdparty/site_config/standard/blog.pchome.net.txt
vendored
Normal file → Executable file
@ -1,12 +1,12 @@
|
||||
# PCHOME blog, a popular Chinese blog host
|
||||
# Oct 15, 2011
|
||||
#
|
||||
|
||||
title://*[contains(@class,'imp')]/h2
|
||||
|
||||
date://*[contains(@class,'imp')]/span
|
||||
body://div[contains(@id,'blog_content')]
|
||||
|
||||
|
||||
# PCHOME blog, a popular Chinese blog host
|
||||
# Oct 15, 2011
|
||||
#
|
||||
|
||||
title://*[contains(@class,'imp')]/h2
|
||||
|
||||
date://*[contains(@class,'imp')]/span
|
||||
body://div[contains(@id,'blog_content')]
|
||||
|
||||
|
||||
|
||||
test_url: http://blog.pchome.net/article/462502.html
|
8
inc/3rdparty/site_config/standard/blog.pinboard.in.txt
vendored
Normal file → Executable file
8
inc/3rdparty/site_config/standard/blog.pinboard.in.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //a[@class="blog_title"]
|
||||
date: //p[@class="when"]/a
|
||||
body: //div[@class="blog_entry"]
|
||||
strip_id_or_class:blog_title
|
||||
title: //a[@class="blog_title"]
|
||||
date: //p[@class="when"]/a
|
||||
body: //div[@class="blog_entry"]
|
||||
strip_id_or_class:blog_title
|
||||
strip_id_or_class:when
|
||||
test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/
|
11
inc/3rdparty/site_config/standard/blog.renren.com.txt
vendored
Executable file
11
inc/3rdparty/site_config/standard/blog.renren.com.txt
vendored
Executable file
@ -0,0 +1,11 @@
|
||||
# This filter is tested on:
|
||||
# http://blog.renren.com/share/224959024/14260739544
|
||||
# http://blog.renren.com/share/231323504/14261768898
|
||||
# http://blog.renren.com/share/230305019/1502806705
|
||||
|
||||
title://h1[contains(@class, 'title-article')]
|
||||
author://span[contains(@class, 'name')]
|
||||
body://div[contains(@class, 'content-body')]
|
||||
|
||||
convert_double_br_tags:yes
|
||||
test_url: http://blog.renren.com/share/230305019/1502806705
|
50
inc/3rdparty/site_config/standard/blog.sina.com.cn.txt
vendored
Normal file → Executable file
50
inc/3rdparty/site_config/standard/blog.sina.com.cn.txt
vendored
Normal file → Executable file
@ -1,26 +1,26 @@
|
||||
# Sina blog, the most popular blog host in China.
|
||||
# Its source code is horrible.
|
||||
#
|
||||
# Issue:
|
||||
# Only the first image in the article is displayed.
|
||||
# The rest images are replace by a 1x1 transparent gif by sina blog host.
|
||||
#
|
||||
|
||||
title://*[contains(@class,'titName SG_txta')]
|
||||
author://*[contains(@id,'ownernick')]
|
||||
date://*[contains(@class,'time SG_txtc')]
|
||||
body://div[contains(@class,'articalContent')]
|
||||
|
||||
# Remove redundant content which has span class start with "MASS"
|
||||
# Example <span class="MASSf21674ffeef7"></span>
|
||||
strip://span[contains(@class,'MASS')]
|
||||
|
||||
# Remove comment
|
||||
strip://div[contains(@class,'allComm')]
|
||||
|
||||
# Remove hiden text and link
|
||||
strip://ins
|
||||
|
||||
tidy:no
|
||||
convert_double_br_tags:yes
|
||||
# Sina blog, the most popular blog host in China.
|
||||
# Its source code is horrible.
|
||||
#
|
||||
# Issue:
|
||||
# Only the first image in the article is displayed.
|
||||
# The remaining images are replaced with a 1x1 transparent GIF by the Sina blog host.
|
||||
#
|
||||
|
||||
title://*[contains(@class,'titName SG_txta')]
|
||||
author://*[contains(@id,'ownernick')]
|
||||
date://*[contains(@class,'time SG_txtc')]
|
||||
body://div[contains(@class,'articalContent')]
|
||||
|
||||
# Remove redundant spans whose class starts with "MASS"
|
||||
# Example <span class="MASSf21674ffeef7"></span>
|
||||
strip://span[contains(@class,'MASS')]
|
||||
|
||||
# Remove comment
|
||||
strip://div[contains(@class,'allComm')]
|
||||
|
||||
# Remove hidden text and links
|
||||
strip://ins
|
||||
|
||||
tidy:no
|
||||
convert_double_br_tags:yes
|
||||
test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html
|
0
inc/3rdparty/site_config/standard/blog.spu.edu.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/blog.spu.edu.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/blog.wells.ee.txt
vendored
Normal file → Executable file
10
inc/3rdparty/site_config/standard/blog.wells.ee.txt
vendored
Normal file → Executable file
@ -1,6 +1,6 @@
|
||||
title: //h2/a[@class="no-link title"]
|
||||
author: //h2[@id="blog_owner"]
|
||||
date: //time
|
||||
strip: //h2/a[@class="no-link title"]
|
||||
test_url: http://blog.wells.ee/retina
|
||||
title: //h2/a[@class="no-link title"]
|
||||
author: //h2[@id="blog_owner"]
|
||||
date: //time
|
||||
strip: //h2/a[@class="no-link title"]
|
||||
test_url: http://blog.wells.ee/retina
|
||||
test_url: http://blog.wells.ee/skeuomorphism
|
12
inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt
vendored
Normal file → Executable file
12
inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt
vendored
Normal file → Executable file
@ -1,8 +1,8 @@
|
||||
# 2011-08-23 [carlo@...] Initial version.
|
||||
|
||||
author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
|
||||
|
||||
# why yes, I do feel a bit dirty
|
||||
date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
|
||||
# 2011-08-23 [carlo@...] Initial version.
|
||||
|
||||
author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
|
||||
|
||||
# why yes, I do feel a bit dirty
|
||||
date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
|
||||
|
||||
test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero
|
0
inc/3rdparty/site_config/standard/blogs.forbes.com.txt
vendored
Normal file → Executable file
0
inc/3rdparty/site_config/standard/blogs.forbes.com.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/blogs.hbr.org.txt
vendored
Normal file → Executable file
6
inc/3rdparty/site_config/standard/blogs.hbr.org.txt
vendored
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
title: //div[@id='pageFeature']/h1
|
||||
body: //div[@id='articleBody']
|
||||
strip: //div[@class='module wide']
|
||||
title: //div[@id='pageFeature']/h1
|
||||
body: //div[@id='articleBody']
|
||||
strip: //div[@class='module wide']
|
||||
test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user