2014-07-13 04:15:40 -04:00
|
|
|
# TODO: clean up the extra junk at the end of articles
|
|
|
|
|
|
|
|
# general text formatting
|
|
|
|
prune: no
|
|
|
|
convert_double_br_tags:yes
|
|
|
|
|
|
|
|
# where to find the basic metadata (author, date, title) and the article body
|
|
|
|
author://a[@class='articleauthor']
|
|
|
|
date://a[starts-with(@href,'/en/search/published/')]
|
|
|
|
title:substring-before(//h2[@class='title'],'—')
|
|
|
|
body://div[@id='maincontainer']
|
|
|
|
|
|
|
|
dissolve://div[starts-with(@id,'commentableblock')]
|
|
|
|
|
|
|
|
# strip unwanted elements: domusnetwork blocks, relative_wrapper blocks, and caption arrow images
|
|
|
|
strip://div[contains(@class,'domusnetwork')]
|
|
|
|
strip://div[contains(@class,'relative_wrapper')]
|
|
|
|
|
|
|
|
strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')]
|
2013-12-06 04:13:03 -05:00
|
|
|
wrap_in(em): //div[contains(@class,'captionsubimage')]/span
|
|
|
|
test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/
|