"""
parser.http.characterParser module (imdb package).

This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a character.
E.g., for "Jesse James" the referred pages would be:
    main details:   http://www.imdb.com/character/ch0000001/
    biography:      http://www.imdb.com/character/ch0000001/bio
    ...and so on...

Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
               2008 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""

import re
from utils import Attribute, Extractor, DOMParserBase, build_movie, \
                    analyze_imdbid
from personParser import DOMHTMLMaindetailsParser

from imdb.Movie import Movie


_personIDs = re.compile(r'/name/nm([0-9]{7})')
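# _personIDs captures the seven-digit person identifier from hrefs such as
# "/name/nm0000001"; it is used below to collect the roleID values (the
# persons who played the character) from the links inside each filmography entry.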
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "filmography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_maindetails_html_string)
    """
    _containsObjects = True

    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': ".//text()",
                          'status': "./i/a//text()",
                          'roleID': "./a/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              movieID=analyze_imdbid(x.get('link') or u''),
                              roleID=_personIDs.findall(x.get('roleID') or u''),
                              status=x.get('status') or None,
                              _parsingCharacter=True))]
extractors = [
|
||
|
Extractor(label='title',
|
||
|
path="//title",
|
||
|
attrs=Attribute(key='name',
|
||
|
path="./text()",
|
||
|
postprocess=lambda x: \
|
||
|
x.replace(' (Character)', '').replace(
|
||
|
'- Filmography by type', '').strip())),
|
||
|
|
||
|
Extractor(label='headshot',
|
||
|
path="//a[@name='headshot']",
|
||
|
attrs=Attribute(key='headshot',
|
||
|
path="./img/@src")),
|
||
|
|
||
|
Extractor(label='akas',
|
||
|
path="//div[h5='Alternate Names:']",
|
||
|
attrs=Attribute(key='akas',
|
||
|
path="./div//text()",
|
||
|
postprocess=lambda x: x.strip().split(' / '))),
|
||
|
|
||
|
Extractor(label='filmography',
|
||
|
path="//div[@class='filmo'][not(h5)]/ol/li",
|
||
|
attrs=_film_attrs),
|
||
|
|
||
|
Extractor(label='filmography sections',
|
||
|
group="//div[@class='filmo'][h5]",
|
||
|
group_key="./h5/a/text()",
|
||
|
group_key_normalize=lambda x: x.lower()[:-1],
|
||
|
path="./ol/li",
|
||
|
attrs=_film_attrs),
|
||
|
]
|
||
|
|
||
|
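    # The single preprocessor below trims each filmography <li> in the raw HTML
    # before extraction: everything from the <br> that precedes the "..." role
    # text up to the closing </li> is dropped, leaving only the title link.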
    preprocessors = [
        # Check that this doesn't cut "status"...
        (re.compile(r'<br>(\.\.\.|    ).+?</li>', re.I | re.M), '</li>')]


class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    extractors = [
            Extractor(label='introduction',
                        path="//div[@id='_intro']",
                        attrs=Attribute(key='introduction',
                            path=".//text()",
                            postprocess=lambda x: x.strip())),

            Extractor(label='biography',
                        path="//span[@class='_biography']",
                        attrs=Attribute(key='biography',
                            multi=True,
                            path={
                                'info': "./preceding-sibling::h4[1]//text()",
                                'text': ".//text()"
                            },
                            postprocess=lambda x: u'%s: %s' % (
                                x.get('info').strip(),
                                x.get('text').replace('\n',
                                    ' ').replace('||', '\n\n').strip()))),
    ]

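    # The preprocessors below rewrite the raw HTML so the extractors above have
    # stable hooks to match: the introduction is wrapped in <div id="_intro">,
    # every biography entry is wrapped in <span class="_biography">, and
    # paragraph breaks (<br/><br/>) become '||' markers.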
    preprocessors = [
        (re.compile('(<div id="swiki.2.3.1">)', re.I), r'\1<div id="_intro">'),
        (re.compile('(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'),
        (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'),
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('\|\|\n', re.I), r'</span>'),
        ]


class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='charquotes',
                    group="//h5",
                    group_key="./a/text()",
                    path="./following-sibling::div[1]",
                    attrs=Attribute(key=None,
                        path={'txt': ".//text()",
                              'movieID': ".//a[1]/@href"},
                        postprocess=lambda x: (analyze_imdbid(x['movieID']),
                                    x['txt'].strip().replace(':   ',
                                    ': ').replace(':  ', ': ').split('||'))))
        ]

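    # The preprocessors below normalize the quotes markup before extraction:
    # the quotes section following each <h5> header is wrapped in a <div>,
    # double <br/> breaks become '||' (separating quotes) and single <br/>
    # breaks become '::' (separating lines within a quote).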
    preprocessors = [
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
        (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        (re.compile('\s*<br/>\s*', re.I), r'::')
        ]

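    # postprocess_data turns the {title: (movieID, quotes)} mapping built by
    # the extractor above into {'quotes': {Movie-or-title: [quote lines]}},
    # splitting each quote on the '::' line separator introduced by the
    # preprocessors.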
    def postprocess_data(self, data):
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                              accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}


from personParser import DOMHTMLSeriesParser

_OBJECTS = {
    'character_main_parser': ((DOMHTMLCharacterMaindetailsParser,),
                                {'kind': 'character'}),
    'character_series_parser': ((DOMHTMLSeriesParser,), None),
    'character_bio_parser': ((DOMHTMLCharacterBioParser,), None),
    'character_quotes_parser': ((DOMHTMLCharacterQuotesParser,), None)
}
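

# A minimal usage sketch.  These parsers are normally driven through the
# imdb.IMDb('http') access system rather than called directly; the block below
# only assumes that a character biography page and a character quotes page
# from akas.imdb.com have been saved as local HTML files, and that this
# module's package-relative imports (utils, personParser, imdb.Movie) resolve.
# The exact shape of the returned dictionary depends on DOMParserBase (it is
# expected to contain a 'data' key with the extracted information).
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        print('usage: characterParser.py <bio.html> <quotes.html>')
        sys.exit(1)
    bio_html = open(sys.argv[1]).read()
    quotes_html = open(sys.argv[2]).read()
    bio_parser = DOMHTMLCharacterBioParser()
    quotes_parser = DOMHTMLCharacterQuotesParser()
    # parse() takes the page as a string and returns a dictionary of results.
    print(bio_parser.parse(bio_html))
    print(quotes_parser.parse(quotes_html))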