[add] new specific configuration files

This commit is contained in:
Nicolas Lœuillet 2013-12-06 10:13:03 +01:00
parent d5501950e2
commit ac4d114214
773 changed files with 6982 additions and 0 deletions

View File

@ -0,0 +1,6 @@
title: //div[@class='post_header']//h2/a
author: //span[@class='author']
date: //span[@class='date']
body: //div[@id='Content']
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department

View File

@ -0,0 +1,9 @@
body: //div[@class='content']
date: //div[@class='content']/h2
strip: //div[@class='content']/h2
title: //div[@class='content']/h3
strip: //div[@id='postmenu']
strip: //div[@class='trackback']
tidy: no
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html

View File

@ -0,0 +1,11 @@
body: //div[@id='main']
title: //div[@class='intro']/h1
author: //ul[@class='text-data']/li[@class='author']
date: //ul[@class='text-data']/li[@class='date']
convert_double_br_tags: yes
tidy: no
strip: //div[@class='share']
strip: //*[@class='zoom']
strip: //div[@id='disqus_thread']
test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html

View File

@ -0,0 +1,4 @@
body: //*[@class = 'content']
author: //*[@class = 'submitted']/a
date: substring-after(//*[@class = 'submitted']/text(), '|')
test_url: http://www.43folders.com/2011/04/22/cranking

View File

@ -0,0 +1,27 @@
# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
# photo page example: http://500px.com/photo/4181666
# blog page example: http://500px.com/blog/110
# avoid "no text" error
tidy:no
prune:no
# reorganize photo page elements
#body://div[contains(@class,'container')]
move_into(body)://div[contains(@id,'thephoto')]
move_into(body)://div[contains(@id,'description')]
move_into(body)://div[contains(@id,'tags')]
move_into(body)://div[contains(@id,'photo-info')]
# clean photo page info
strip://span[contains(@id,'copyright')]
strip://*[contains(@id,'store')]
strip://*[contains(@id,'user-info')]
strip://*[contains(@id,'photo-stats')]
strip://*[contains(@id,'voting_controls_container')]
strip://*[contains(@id,'more-photos')]
strip://*[contains(@id,'embed-photo')]
# clean blog page side bar
strip://*[contains(@class,'col d3 clearafter')]
test_url: http://500px.com/photo/3641041?from=editors

View File

@ -0,0 +1,2 @@
title: substring-before(//title, '—')
test_url: http://512pixels.net/more-on-linked-lists/

View File

@ -0,0 +1,9 @@
body: //*[@id="episode"]
prune: no
tidy: no
autodetect_next_page: no
strip_id_or_class: player
strip://*[@id="header"]
test_url: http://5by5.tv/buildanalyze/60

View File

@ -0,0 +1,9 @@
title: //h2[@class='border']
body: //div[@class='padding']
convert_double_br_tags: yes
strip: //div[@id='social_sharing']
strip: //div[@class='socialLinks']
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/

View File

@ -0,0 +1,10 @@
title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton
tidy: no
prune: no
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757

View File

@ -0,0 +1,10 @@
title: //meta[@property='og:title']/@content
body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
strip_id_or_class: socialshareprivacy1
strip_id_or_class: zvaFacebookButton
tidy: no
prune: no
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718

View File

@ -0,0 +1,7 @@
title: //meta[@property='og:title']/@content
body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text']
strip_id_or_class: colB
prune: no
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html

View File

@ -0,0 +1,10 @@
title: //h1
author: //div[@class="byline"]/a
date: //span[@class="timestamp"]
strip: //p[@class="topics"]
strip: //h1
strip: //div[@class="byline"]
strip: //p[@class="published"]
strip: //div[contains(@class,"featured-scroller")]
test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544

View File

@ -0,0 +1,27 @@
title: //h1[@class='headline']
body: //div[@id='storyText']
# for video entries
body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
author: //div[@class='byline']
date: //div[@class='date']
strip: //*[@id='date_partner']
strip: //div[@class='breadcrumb']
strip: //div[contains(@class,'show_tools')]
strip: //div[@id='sponsoredByAd']
strip: //div[contains(@class,'rel_container')]
strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
strip: //p[contains(., 'Click here to return to')]
#strip_id_or_class: media
strip_id_or_class: mediaplayer
replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
prune: no
single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
# multi-page
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544

View File

@ -0,0 +1,9 @@
title: //div[@id='H_docTitle']
body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
strip_id_or_class: F_toenail
prune: no
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html

View File

@ -0,0 +1,3 @@
body: //div[starts-with(@id, 'news-id-')]
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html

View File

@ -0,0 +1,9 @@
title://h1[@class="title"]
author://div[@class="submitted"]/span/a
date://div[@class="submitted"]/span
body://div[@class="content-wrapper"]
strip://div[@id="skip-link"]
strip://div[@id="region-content-3-3"]
strip://div[@id="section-footer"]
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code

View File

@ -0,0 +1,5 @@
tidy:no
date: //time[@class='updated']
dissolve: //ul[@class='video-gallery']/li
dissolve: //ul[@class='video-gallery']
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php

View File

@ -0,0 +1,15 @@
body: //div[@id='content']
# clean up recipe pages
strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
#recipe pages
strip_id_or_class: "recipe-feedback"
strip_id_or_class: "comments"
strip_id_or_class: "procedure-number"
strip_id_or_class: "more-with-author"
#slice
strip_id_or_class: "inner"
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html

View File

@ -0,0 +1,2 @@
body: //div[@class="entry"]
test_url: http://alex.mullr.net/blog/2011/05/on-spotify/

View File

@ -0,0 +1,12 @@
title: //h1[@class='title']
author: //h3[@class='byline']/a
date: //div[@class='ishinfo']
body: //*[@id='articletext']
strip_id_or_class: 'ishinfo'
strip_id_or_class: 'metastuff'
strip_id_or_class: 'learnmore'
strip_id_or_class: 'discuss'
prune: no
test_url: http://www.alistapart.com/articles/organizing-mobile/

View File

@ -0,0 +1,8 @@
title: //span[@id='DetailedTitle']
body: //td[@id='tdTextContent']
strip_id_or_class: Skyscrapper_Body
date: //span[@id='ctl00_cphBody_lblDate']
author: //div[@id="dvAuthorInfo"]//a/text()
strip: //table[ tbody/tr/td/object ]
prune: no
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html

View File

@ -0,0 +1,14 @@
title: //h1[@id='itemTitle']
body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
strip: //div[contains(@class, 'rightcoltoolsdiv')]
strip: //div[contains(@class, 'servings-form')]
strip: //p[@class='nutritional-information']
strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
strip_id_or_class: eshaAttribute
strip_id_or_class: eshaParagraph
prune: no
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd

View File

@ -0,0 +1,10 @@
title://div[@class="article-title"]/h1[@class="title"]
date: //p[@class="article-date"]
body://*[@class="article-body article-text"]
# Trim out related posts at bottom of article
strip://blockquote[@class="memo"]
# Yup, no idea why author won't work...
author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/

View File

@ -0,0 +1,8 @@
title: //div[@id='pageHdr']//h1
body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
strip: //div[contains(@class, 'infoBox') or @id='infoBox']
single_page_link: //li[@id='print']/a
prune: no
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/

View File

@ -0,0 +1,11 @@
body: //div[@class = 'entry']
date: substring-after(//p[@class="date"],'بتاريخ ')
strip_id_or_class: date
strip_id_or_class: follow-single
strip_id_or_class: ratingblock
strip_id_or_class: newRatingHolder
strip_id_or_class: postmetadata
strip_id_or_class: addthis_toolbox
strip_id_or_class: addthis_default_style
strip_id_or_class: size-full
test_url: http://alphabeta.argaam.com/?p=35657

View File

@ -0,0 +1,9 @@
body: //div[@id = "article-view"]
body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
author: //p[@class = "author"]
strip: //h1
strip: //h2
strip_id_or_class: author
prune: no
test_url: http://www.alriyadh.com/2011/10/10/article674357.html
test_url: http://www.alriyadh.com/net/article/780935

View File

@ -0,0 +1,2 @@
title: //*[@id='normalfontyellow']
test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2

View File

@ -0,0 +1,2 @@
body: //*[(@class = "historia")]
test_url: http://alt1040.com/2011/09/banda-ancha-en-america-latina-insignificante

View File

@ -0,0 +1,2 @@
body: //*[(@class = "historia")]
test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras

View File

@ -0,0 +1,10 @@
title: //h1
author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
date: //div/a[contains (@href, "issue")]
move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
body: //div[@class="enableBullets"]
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819

View File

@ -0,0 +1,19 @@
title: //span[@id = 'btAsinTitle']
body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
#strip_id_or_class: quantityDropdownDiv
#strip_id_or_class: addToCartSpan
#strip_id_or_class: oneClickDiv
strip_id_or_class: nocontent
strip_id_or_class: masDynamicConten
strip_id_or_class: dynamic-content
prune: no
find_string: <span id="actualPriceValue">
replace_string: <span id="actualPriceValue"><br />Price:
strip_id_or_class: collapsePS
strip_id_or_class: expandPS
strip_id_or_class: psPlaceHolde
strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/

View File

@ -0,0 +1,6 @@
title: //div[@class='head']/h2/a
author: //div[@class='head']/a
date: //div[@class='head']/p[@class='date']/a
body: //div[@class='copy']
strip: //p[@class='meta']
test_url: http://americandrink.net/post/10567188712/free-the-hooch

View File

@ -0,0 +1,10 @@
title: //div[@class="editorial-content"]/h3
body: //div[@class="hero-image" or @class="editorial-content"]
strip: //ul[@class="hero-caption"]
strip_id_or_class: footer
prune: no
tidy: no
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/

View File

@ -0,0 +1,5 @@
title: //h1[@class="post-title"]
author: //span[@class="author"]/a
date: //span[@class="date"]
body: //div[@class="post-content main"]
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/

View File

@ -0,0 +1,11 @@
author: //a[@class='b'][1]
date: substring-after(substring-before(//div, 'Posted in'), ' on ')
strip_image_src: /content/images/globals/
strip: //h2[. = 'Page 1']/preceding::p
strip: //h2
prune: no
single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/

View File

@ -0,0 +1,9 @@
title: //h2
author: string('Andy Rutledge')
date: //div[@class='articledate']
body: //div[@class='copybody']
strip: //*[@class='space']
strip: //*[@class='articleFoot']
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php

View File

@ -0,0 +1,9 @@
title: //h1[@class="title"]
author: ("Anna Manasova")
# is ignored, unfortunately
date: //p[@class="date"]
body: //div[@class="entry"]
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/

View File

@ -0,0 +1,18 @@
title: //h1[contains(@class, 'title')]
body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
date: //div[@class='date']
strip_id_or_class: sharethis
strip_id_or_class: stats
strip_id_or_class: apply_form
strip_id_or_class: job_map
strip_id_or_class: respond
strip: //h1//span[@class='type']
strip: //li[@class='print' or @class='map']
replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
prune: no
tidy: no
test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/

View File

@ -0,0 +1,7 @@
strip: //p[@class='sosumi']
# Aren't they witty?
# I can't work out what causes the  before the title.
title: //h1[@class='title']
strip: //h1[@class='title']
test_url: http://www.apple.com/pr/library/2011/02/15appstore.html

View File

@ -0,0 +1,11 @@
title: //p[@class='title']
author: //p[text() = 'By ']/a/text()
strip: //p[text() = 'By ']
body: //td[@class='bod']
strip_id_or_class: title
strip_id_or_class: minor
strip_id_or_class: multipagefooter
test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html

View File

@ -0,0 +1,2 @@
body: //*[(@class = "historia")]
test_url: http://appleweblog.com/2011/09/encontrada-vulnerabilidad-grave-en-skype-para-ios

View File

@ -0,0 +1,5 @@
date: //div[@class='post_date']
body: //div[@class='post_content']
test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up

View File

@ -0,0 +1,18 @@
# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
# Include: work meta, summary, chapter information, and notes, which Instapaper strips out by default.
# Exclude: header, footer, navigation, comments.
# Notes: User is a newbie with XPaths.
title: //h2[@class='title']
author: //h3[@class='byline']
author: //a[@class='login author']
strip_id_or_class:header
strip_id_or_class:navigation
strip_id_or_class:feedback
strip_id_or_class:kudos
strip_id_or_class:add_comment_placeholder
strip_id_or_class:add_comment
strip_id_or_class:globalize
strip_id_or_class:footer
test_url: http://archiveofourown.org/works/229402?view_full_work=true

View File

@ -0,0 +1,16 @@
author: //p[@class='byline']/a
body: //div[contains(@class,'article-content')]
strip: //h2[@class='title']
strip_id_or_class: byline
prune: no
date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
date: //div[@class='byline']/span[@class='posted']//abbr
title: //div[@id='story']//h2[@class='title']
strip: //div[@class='pager']
next_page_link: //nav//a[span/@class='next']/@href
test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/

View File

@ -0,0 +1,6 @@
title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
strip_id_or_class: mod-pagination
test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park

View File

@ -0,0 +1,11 @@
title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
strip_id_or_class: mod-article-byline
strip_id_or_class: mod-article-header
strip_id_or_class: mod-article-subtitle
#This leaves some crud after the article, but it's better than nothing.
#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown

View File

@ -0,0 +1,3 @@
body: //div[@id='HeadLine']
strip: //div[@id='utility_right']
test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html

View File

@ -0,0 +1,5 @@
title: //h1[@class='article_title']
author: //span[@class='author']
date: //h2[@class='dateline']
body: //div[@class='article_body']
test_url: http://ascarter.net/2012/02/20/enough-is-enough.html

View File

@ -0,0 +1,7 @@
title: //span[@class='titel']
author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
date: substring-after(//span[@class='metadaten_C'],'astronews.com')
strip: //span[@class='bu']
strip_image_src: '/_images/'
test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml

View File

@ -0,0 +1,8 @@
# Johannes Stühler
title://h2
author://span[@class='meta-content']
date://abbr[@class='date published']/@title
body://div[@class='entry-content']
test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/

View File

@ -0,0 +1,6 @@
prune: no
body: //div[@class='post-body']
author: //p[@class='byline']//a
date: substring-after(//div[@class='about']/p[2], 'Posted')
strip: //div[@class='body']/div[@class='meta']
test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/

View File

@ -0,0 +1,4 @@
author: //*[@id="article_wrapper"]/div[1]/a[1]
body: //*[@id="article_wrapper"]/div[2]
date: //*[@id="article_wrapper"]/div[1]/text()[2]
test_url: http://www.avclub.com/articles/forgetmenot,70904

View File

@ -0,0 +1,12 @@
single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
convert_double_br_tags: yes
title: //div[@class="story"]/h1
body: //div[@id="story-body-text"]
author: //span[@class="byline"]
date: //p[@class="date"]
strip: //*[@class='all']
strip: //*[@class='articlerail']
test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story

View File

@ -0,0 +1,7 @@
title: //h2
date: //span[@class='date']
body: //div[@class='entry']
strip: //div[@class='zusatz']
test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/

View File

@ -0,0 +1,13 @@
author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
body: //div[@class='first-article-big']
strip: //table[@class='newsimagecontainer']
strip: //h3[@class='headlines']
strip: //iframe[@class='headlines']
strip: //a[@class='newslink']
convert_double_br_tags: yes
test_url: http://bb.is/Pages/82?NewsID=174119

View File

@ -0,0 +1,32 @@
body: //div[@class="story-body"]
title: //h1[@class="story-header"]
date: //span[@class="story-date"]/span[@class='date']
# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
#strip: //div[@class="story-feature narrow"]
#strip: //div[@class="story-feature wide"]
#strip: //div[@class="story-feature dslideshow-enclosure"]
strip: //div[contains(@class, "story-feature")]
strip: //span[@class="story-date"]
#strip: //div[@class="caption body-narrow-width"]
strip: //div[@class="warning"]//p
strip: //div[@id='page-bookmark-links-head']
strip: //object
strip: //div[contains(@class, "bbccom_advert_placeholder")]
strip: //div[contains(@class, "embedded-hyper")]
strip: //div[contains(@class, 'market-data')]
strip: //a[contains(@class, 'hidden')]
strip: //div[contains(@class, 'hypertabs')]
strip: //div[contains(@class, 'related')]
strip: //form[@id='comment-form']
strip: //div[contains(@class, 'comment-introduction')]
replace_string(<noscript>): <div>
replace_string(</noscript>): </div>
prune: no
dissolve: //h2
test_url: http://www.bbc.co.uk/news/business-15060862

View File

@ -0,0 +1,16 @@
body: //div[@class="entry-content"]
# Remove text &lsquo;Tweet&rsquo;
strip: //div[@class="entry-content"]/div[last()]
title: //h1[@class="entry-title"]
# If the Instapaper text parser worked with HTML5 tags, we would use:
date: //time[@class="entry-date"]
# But since it does not, use this more complicated rule:
date: //div[@class="entry-meta"]/a[@rel="bookmark"]
# Unfortunately, the following rule is overridden by the automatically found author.
author: ("Benoit Maison")
test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/

View File

@ -0,0 +1,3 @@
title: //h1[@class='headline']
body: //div[contains(@class, 'article-wrapper')]
test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa

View File

@ -0,0 +1,2 @@
body: //div[@class="entry-content"]
test_url: http://www.betabeat.com/2011/07/04/sheryl-sandberg-breaks-through-silicon-valleys-boys-club-sort-of/

View File

@ -0,0 +1,7 @@
# For some articles on this site, like the one below, normal
# processing does not seem to pick up the article body;
# other articles come through fine.
# http://www.betanews.com/joewilcox/article
# /Google-is-a-marketing-sensation/1309708375
body: //*[@id="article"]
test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375

View File

@ -0,0 +1,8 @@
title: //div[contains(@class, 'main-content')]//h1
body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
prune: no
single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
test_url: http://www.biography.com/print/profile/martin-luther-9389283

View File

@ -0,0 +1,2 @@
body: //*[(@class = "historia")]
test_url: http://bitelia.com/2011/09/klout-midiendo-influencia

View File

@ -0,0 +1,7 @@
title: //h1[@class='articlehead']
body: //div[@class='column']
strip: //h1
strip: //div[@class='help']
#no author or date/time provided in current layout
test_url: http://bjango.com/articles/actions/

View File

@ -0,0 +1,8 @@
tidy: no
prune: no
date: //article/header/h6/time
title: //article/header/h3
author: //meta[@name='author']/@content
body: //article//post
test_url: http://blog.arsln.org/aska-ayip-oluyor/

View File

@ -0,0 +1,7 @@
title: //title
author: //span[@class='author vcard']/a
date: //p[@class='headline_meta']/abbr[@class='published']
body: //div[@class='format_text entry-content']
strip: //div[@id='dd_ajax_float']
test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html

View File

@ -0,0 +1,9 @@
# Instapaper gets this back to front and only gets the blog title instead of the article title.
title: substring-before(//title, '-')
author: //a[ contains(@href, '/people') ]
body: //div[ @class='post' ]
# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n

View File

@ -0,0 +1,5 @@
title: //h2
date: //h3
body: //ul
test_url: http://blog.fefe.de/?ts=b063bf55

View File

@ -0,0 +1,11 @@
# clean Instagram blog a little bit
tidy:no
prune:no
body://div[contains(@id,'content')]
strip_id_or_class:meta
strip_id_or_class:notes
strip_id_or_class:pagination
test_url: http://blog.instagram.com/post/8757832007/fromwhereistand

View File

@ -0,0 +1,4 @@
date: //span[contains(@class, 'date-links')]
author: //span[contains(@class, 'author-links')]
body: //div[contains(@class, 'entry-content')]
test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web

View File

@ -0,0 +1,5 @@
body: //*[contains(@class, 'post_content')]
author: string('Kaelig Deloumeau-Prigent')
title: //h1[@class='title']
date: //span[@class='date']
test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par

View File

@ -0,0 +1,6 @@
title: //span[@class='pcol1 itemSubjectBoldfont']
body: //div[@id='postListBody']
date: //p[@class='date fil5 pcol2']
single_page_link: /html/frameset/frame[1]/attribute::src
strip: //div[@class='post-btn']
test_url: http://blog.naver.com/how2invest/110135068757

View File

@ -0,0 +1,12 @@
# PCHOME blog, a popular Chinese blog host
# Oct 15, 2011
#
title://*[contains(@class,'imp')]/h2
date://*[contains(@class,'imp')]/span
body://div[contains(@id,'blog_content')]
test_url: http://blog.pchome.net/article/462502.html

View File

@ -0,0 +1,6 @@
title: //a[@class="blog_title"]
date: //p[@class="when"]/a
body: //div[@class="blog_entry"]
strip_id_or_class:blog_title
strip_id_or_class:when
test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/

View File

@ -0,0 +1,26 @@
# Sina blog, the most popular blog host in China.
# Its source code is horrible.
#
# Issue:
# Only the first image in the article is displayed.
# The remaining images are replaced with a 1x1 transparent gif by the Sina blog host.
#
title://*[contains(@class,'titName SG_txta')]
author://*[contains(@id,'ownernick')]
date://*[contains(@class,'time SG_txtc')]
body://div[contains(@class,'articalContent')]
# Remove redundant content which has span class start with "MASS"
# Example <span class="MASSf21674ffeef7"></span>
strip://span[contains(@class,'MASS')]
# Remove comment
strip://div[contains(@class,'allComm')]
# Remove hidden text and link
strip://ins
tidy:no
convert_double_br_tags:yes
test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html

View File

@ -0,0 +1,2 @@
body://div[@class='post']
test_url: http://blog.spu.edu/lectio/from-the-frying-pan-into-the-fire/

View File

@ -0,0 +1,6 @@
title: //h2/a[@class="no-link title"]
author: //h2[@id="blog_owner"]
date: //time
strip: //h2/a[@class="no-link title"]
test_url: http://blog.wells.ee/retina
test_url: http://blog.wells.ee/skeuomorphism

View File

@ -0,0 +1,8 @@
# 2011-08-23 [carlo@...] Initial version.
author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
# why yes, I do feel a bit dirty
date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero

View File

@ -0,0 +1,2 @@
body: //div[@class='entry']
test_url: http://blogs.forbes.com/adamhartung/2011/04/08/apple-is-better-managed-than-microsoft/

View File

@ -0,0 +1,4 @@
title: //div[@id='pageFeature']/h1
body: //div[@id='articleBody']
strip: //div[@class='module wide']
test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29

View File

@ -0,0 +1,6 @@
title: //h3[@class="post-name"]
author: //span[@class="user-name"]
date: //div[@class="post-date"]
body: //div[@class="post-content user-defined-markup"]
footnotes: no
test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx

View File

@ -0,0 +1,3 @@
title: //div[@id='single']/h1
body: //div[@id='postcontent']
test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/

View File

@ -0,0 +1,16 @@
# meta data
title://h1[@class = 'postTitle']
author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|')
date://span[@class = 'datestamp']
#body content
body://div[@id = 'singleBlogPost']
#reclaim author info
move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv']
strip://p[@class = 'moreLink mobileHide']
#cleanup comments, there might be some open <div> sections
strip://div[@id = 'comments2']
strip://h3[a[@href = '#add-comment']]
test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/

View File

@ -0,0 +1,15 @@
# metadata
author://div[@class = 'post']/div[@class='meta']/a[1]
date://div[@id = 'rap']/h2[1]
body://div[@class = 'post']
# wrapping caption and image
wrap_in(fieldset)://div[contains(@class, 'wp-caption')]
# clean up
strip://div[@class = 'post']/h3[@class = 'storytitle']
strip://div[@class = 'post']/div[@class = 'social']
strip://img[@style = 'display:none;']
strip://img[@height='0' and @width='0']
test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/

View File

@ -0,0 +1,6 @@
title: //h3[@class="post-name"]
author: //span[@class="user-name"]
date: //div[@class="post-date"]
body: //div[@class="post-content user-defined-markup"]
footnotes: no
test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx

View File

@ -0,0 +1,4 @@
body://div[@class='entry']
date://div[@class='meta']
strip://a[@class='FlattrButton']
test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/

View File

@ -0,0 +1,8 @@
title: //h1[@class="entry-title"][2]
author: string("Paul Boag")
date: substring(//span[@class="meta"], 11)
body: //article
strip: //h2
strip: //h1
strip: //div[@id="callsToAction"]
test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/

View File

@ -0,0 +1,11 @@
# This is far from perfect, but so is BoingBoing's markup
title: //h2[@class="headline"]
single_page_link: //h2[@class="headline"]/a
#date: //p[@class="byline"]
body: //div[@class="post"]
strip_id_or_class: shareMe
strip_id_or_class: authorbox
strip_id_or_class: byline
test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html

View File

@ -0,0 +1,3 @@
title: //h2[@class='entry-title']
body: //div[@class='entry-content']
test_url: http://boldizsar.palotas.eu/blog/?p=1394

View File

@ -0,0 +1,6 @@
body: //span[@property='v:description']
date: //span[@property='v:dtreviewed']
author: //span[@property='v:reviewer']
prune: no
test_url: http://book.douban.com/review/2422662/

View File

@ -0,0 +1,19 @@
#metadata
title://div[@class = 'Topper']/h1
author://div[@class = 'Topper']/h3
date://div[@class = 'Topper']/h6
body://div[@class = 'Core']
# clean up
strip://div[@class = 'Topper']/h1
strip://div[@class = 'Topper']/h3
strip://div[@class = 'Topper']/h4
strip://div[@class = 'Topper']/h5
strip://div[@class = 'Topper']/h6
strip://br[@clear = 'all']
strip://div[@class = 'adCore']
strip://div[@class = 'BookR']
strip://div[@class = 'InfoBox']
test_url: http://bookforum.com/inprint/018_04/8595

View File

@ -0,0 +1,7 @@
title://h1
author://div[@class="meta"]/span/a
date://div[@class="date"]
body://div[@class="content article"]
strip://div[@class="content article"]/h1
test_url: http://borderhouseblog.com/?p=7832

View File

@ -0,0 +1,16 @@
# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.
title: //div[@class="header"]/h1
author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")
date: //div[@class="byline"]/p[last()]
body: //div[@class="article-body"]
strip_id_or_class: aside
strip_id_or_class: promo
strip_id_or_class: skip-nav
strip_id_or_class: article-more
strip_id_or_class: article-bar
# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.
strip_id_or_class: figure
test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html

View File

@ -0,0 +1,15 @@
#basics
title://h3[@class = 'article_title']
date://span[@class = 'article_date']
body://div[@id = 'center_column_article']
#correct, but author not being picked up in preview
author://span[@class = 'article_author']
#strips basics from article
strip_id_or_class:article_title
strip_id_or_class:article_date
strip_id_or_class:article_author
#strips pull quotes
strip_id_or_class:pull_quote
test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php

View File

@ -0,0 +1,5 @@
title: substring-before(//title, '|')
body: //div[@class="entry"]
# Remove the author's picture
strip: //div[@class="entry"]/a[1]
test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html

View File

@ -0,0 +1,10 @@
title: //div[@class="standard"]/h1
author: string("BrainFacts.org")
date: //div[@class="meta"]/strong
strip: //p[@class="skip"]
strip: //div[@class="meta"]
strip: //div[@class="standard"]/h1
strip: //div[@class="modal"]
strip: //div[@class="columnRight"]
test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/

View File

@ -0,0 +1,7 @@
# set body
body: //div[@id='theContent']
# set title
title: //div[@id='theContent']/h3
strip: //div[@id='theContent']/h3
test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html

View File

@ -0,0 +1,3 @@
date://h2[@class="date-header"]
body://div[@class="entry-content"]
test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html

View File

@ -0,0 +1,5 @@
body: //div[@class='post full']
title: //h1
author: substring-after(//title, '- ')
date: //span[@class='date']
test_url: http://brettterpstra.com/byword-for-ios/

View File

@ -0,0 +1,2 @@
body: //div[@class='articleBody']
test_url: http://www.brisbanetimes.com.au/opinion/blogs/blunt-instrument/losing-our-minds--for-24-hours-20120118-1q682.html

View File

@ -0,0 +1,13 @@
title: //div[@id='contentheader']/h1
author: //p[@class='attribution']/span[@class='author']/*
# Is there a way to pull multiple authors? My XPath here is just grabbing the first
date: /html/head/meta[@name="date"]/@content
body: //div[@class='main-content']
strip: //p[@class='byline']
strip: //div[@class='img-gallery']
strip: //div[@class='callout']
strip: //div[@class='add-your-view']
convert_double_br_tags: yes
test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx

Some files were not shown because too many files have changed in this diff Show More