1
0
mirror of https://github.com/moparisthebest/wallabag synced 2024-11-23 17:42:15 -05:00
wallabag/inc/3rdparty/site_config/standard/bostonglobe.com.txt
2014-07-13 10:15:40 +02:00

16 lines
849 B
Plaintext
Executable File

# Extraction rules for article pages on bostonglobe.com. The title, author, date and body values are XPath expressions.
# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", replace the test_url at the bottom of this file with a link to a current-day headline from bostonglobe.com.
# Headline: the <h1> directly inside the <div> whose class attribute is exactly "header".
title: //div[@class="header"]/h1
# Author: text of the byline's <h2 class="author"> with everything up to and including the first "By " removed (substring-after yields an empty string if "By " is absent).
author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")
# Date: the last <p> child of the byline <div>.
date: //div[@class="byline"]/p[last()]
# Article text: the <div> whose class attribute is exactly "article-body".
body: //div[@class="article-body"]
# Non-article elements to remove. Presumably each value is matched against element id/class attributes, as the directive name suggests -- confirm exact matching rules against the parser.
strip_id_or_class: aside
strip_id_or_class: promo
strip_id_or_class: skip-nav
strip_id_or_class: article-more
strip_id_or_class: article-bar
# The next directive removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), this directive should be removed.
strip_id_or_class: figure
# Sample article URL for testing this configuration (see the NOTE at the top about replacing it if results look wrong).
test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html