Update imdbpy libs to v5.0

Fixed invalid indexer id issues for TVRage shows.

Fixed issues for getting posters and backdrops for TVRage shows.

We now convert XML straight to a dict object for the Indexer APIs, improving overall API performance.

Fixed issues with displaying genres properly for TVRage shows.
This commit is contained in:
echel0n 2014-05-28 22:40:12 -07:00
parent 764cf6e62e
commit 2dcd26e69c
30 changed files with 7446 additions and 453 deletions

View File

@ -6,7 +6,7 @@ a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages,
a SQL database, etc.)
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2014 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems']
__version__ = VERSION = '4.9'
__version__ = VERSION = '5.0'
# Import compatibility module (importing it is enough).
import _compat
@ -160,6 +160,7 @@ def IMDb(accessSystem=None, *arguments, **keywords):
kwds.update(keywords)
keywords = kwds
except Exception, e:
import logging
logging.getLogger('imdbpy').warn('Unable to read configuration' \
' file; complete error: %s' % e)
# It just LOOKS LIKE a bad habit: we tried to read config
@ -303,7 +304,7 @@ class IMDbBase:
# http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top
imdbURL_top250=imdbURL_base + 'chart/top',
imdbURL_top250=imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom
imdbURL_bottom100=imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s
@ -824,22 +825,23 @@ class IMDbBase:
# subclass, somewhere under the imdb.parser package.
raise NotImplementedError('override this method')
def _searchIMDb(self, kind, ton):
def _searchIMDb(self, kind, ton, title_kind=None):
"""Search the IMDb akas server for the given title or name."""
# The Exact Primary search system has gone AWOL, so we resort
# to the mobile search. :-/
if not ton:
return None
ton = ton.strip('"')
aSystem = IMDb('mobile')
if kind == 'tt':
searchFunct = aSystem.search_movie
check = 'long imdb canonical title'
check = 'long imdb title'
elif kind == 'nm':
searchFunct = aSystem.search_person
check = 'long imdb canonical name'
check = 'long imdb name'
elif kind == 'char':
searchFunct = aSystem.search_character
check = 'long imdb canonical name'
check = 'long imdb name'
elif kind == 'co':
# XXX: are [COUNTRY] codes included in the results?
searchFunct = aSystem.search_company
@ -852,24 +854,42 @@ class IMDbBase:
# exact match.
if len(searchRes) == 1:
return searchRes[0].getID()
title_only_matches = []
for item in searchRes:
# Return the first perfect match.
if item[check] == ton:
return item.getID()
if item[check].strip('"') == ton:
# For titles do additional check for kind
if kind != 'tt' or title_kind == item['kind']:
return item.getID()
elif kind == 'tt':
title_only_matches.append(item.getID())
# imdbpy2sql.py could have detected the wrong type, so if no result matches
# both title and kind, collect all results that match on title only.
# Return list of IDs if multiple matches (can happen when searching
# titles with no title_kind specified)
# Example: DB: Band of Brothers "tv series" vs "tv mini-series"
if title_only_matches:
if len(title_only_matches) == 1:
return title_only_matches[0]
else:
return title_only_matches
return None
def title2imdbID(self, title):
def title2imdbID(self, title, kind=None):
"""Translate a movie title (in the plain text data files format)
to an imdbID.
Try an Exact Primary Title search on IMDb;
return None if it's unable to get the imdbID."""
return self._searchIMDb('tt', title)
return None if it's unable to get the imdbID;
Always specify kind (movie, tv series, video game, etc.); otherwise the
search can return a list of IDs if multiple matches are found.
"""
return self._searchIMDb('tt', title, kind)
def name2imdbID(self, name):
"""Translate a person name in an imdbID.
Try an Exact Primary Name search on IMDb;
return None if it's unable to get the imdbID."""
return self._searchIMDb('tt', name)
return self._searchIMDb('nm', name)
def character2imdbID(self, name):
"""Translate a character name in an imdbID.
@ -896,7 +916,8 @@ class IMDbBase:
imdbID = aSystem.get_imdbMovieID(mop.movieID)
else:
imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
ptdf=1))
ptdf=0, appendKind=False),
mop['kind'])
elif isinstance(mop, Person.Person):
if mop.personID is not None:
imdbID = aSystem.get_imdbPersonID(mop.personID)

View File

@ -29,7 +29,7 @@
[imdbpy]
## Default.
accessSystem = mobile
accessSystem = http
## Optional (options common to every data access system):
# Activate adult searches (on, by default).
@ -37,7 +37,7 @@ accessSystem = mobile
# Number of results for searches (20 by default).
#results = 20
# Re-raise all caught exceptions (off, by default).
reraiseExceptions = on
#reraiseExceptions = off
## Optional (options common to http and mobile data access systems):
# Proxy used to access the network. If it requires authentication,
@ -69,7 +69,7 @@ reraiseExceptions = on
## Set the threshold for logging messages.
# Can be one of "debug", "info", "warning", "error", "critical" (default:
# "warning").
loggingLevel = info
#loggingLevel = debug
## Path to a configuration file for the logging facility;
# see: http://docs.python.org/library/logging.html#configuring-logging

View File

@ -64,8 +64,10 @@ LANG_ARTICLES = {
'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos',
'unas'),
'Spanish': ('la', 'lo', 'el', 'las', 'un', 'los', 'una', 'al', 'del',
'unos', 'unas', 'uno'),
'French': ('le', "l'", 'la', 'les', 'un', 'une', 'des', 'au', 'du', '\xc3\xa0 la',
'de la', 'aux'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages don't have articles.
}

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
"""
generatepot.py script.

1303
lib/imdb/locale/imdbpy-ar.po Normal file

File diff suppressed because it is too large Load Diff

1303
lib/imdb/locale/imdbpy-bg.po Normal file

File diff suppressed because it is too large Load Diff

1303
lib/imdb/locale/imdbpy-de.po Normal file

File diff suppressed because it is too large Load Diff

1304
lib/imdb/locale/imdbpy-es.po Normal file

File diff suppressed because it is too large Load Diff

1304
lib/imdb/locale/imdbpy-fr.po Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
"""Generate binary message catalog from textual translation description.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
"""
rebuildmo.py script.

View File

@ -104,15 +104,24 @@ PY_VERSION = sys.version_info[:2]
# The cookies for the "adult" search.
# Please don't mess with these account.
# Old 'IMDbPY' account.
_old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
_old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
# New 'IMDbPYweb' account.
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
_IMDbPY_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
_IMDbPY_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
# 'imdbpy2010' account.
_imdbpy2010_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
_imdbpy2010_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
# old 'IMDbPYweb' account.
_old_IMDbPYweb_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_old_IMDbPYweb_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
# old 'IMDbPYweb' account values (as of 2012-12-30)
_IMDbPYweb_cookie_id = 'BCYjtpb46Go0cMHAMewWZEauhwqPL7ASCPpPVNutu6BuayHZd0U6Dk3UAqVlEM8DHLDsSr02RGQn5ff3245-R4A130NAWJ_5yqXx7X-zJey8vQM8JKdv3rTUSEJznJQlojUW1Bije-Q0FXAixs4I0sePWhd_tA41i-9AF2q3lPmaksram6ilMhN9i3IPESW1PMbk'
_IMDbPYweb_cookie_uu = 'BCYttQjEMc-NyUdFUGxThidAnBo7wwalEzj4un9uzf2XoEjtqDhNfrH7bOSuwlRkMEQ11SNyTajl-b9Q-21m4HwYu0e3jXZrjYLXLYzFkrEroCDyUREqaTwPJPSjGtFmvlaVBZEZmsWpaxe18DT5KiygKyGPZKH78Xu4im6ba-Sd31WvbXHzP8KGXPpGjhhVuv7Dcv314HCWkE832Srf9ya-Uv0FdGAmYyLbIAXuxnvpYQd6oZ8-CYkSGLIqcKWdrf5S'
# 'IMDbPY2013' account
_IMDbPY2013_cookie_id = 'BCYmoyqSm2WglmOzG-SrFWSvVpxsTZOB0qEOOqmAwCBxCbaNgKOxd0DTKzUvt7t04Pya5gV2tUrpDmYxrc1Dr54DQj2UXI7QI35__M5-HI2KrbOI3PjDz6M-_U3HG8topMfN64R24tmBixoZhMYXVaEc556lf0Z4gQNJVYRANXvwytP5v1lpfeToRlu9aVJwN4kT'
_IMDbPY2013_cookie_uu = 'BCYquDS8Y2i8R1pJxS4nB77YrhjHHXeOea2Xl9KtZvE6RZKVfMvzTGU4Vl5-yxfPbgRSiFJasyf-hhPuVvXyaHlfeBjNlbFT8hz2HzFFkQ_SxKxq05J51gi7Fv4SaAws1M-i7zmQ1TRunfJqCVIYqPwIs2NO7s4_YDH2ZoISVGLgca8OY2K58HychOZB1oRWHVeAJNhLJMrCWJBuGRLCNnQK5X9tA0dPPntr2Ussy0ouul-N1GQz-8y5vda3JJ_C6xkwmHcA6JrOdOFO_HqMWjVSXuxGEdrXC919JM9H0vooVvKeVgAEJnTh2GiVlUJUoH3c'
# imdbpy2010 account.
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
# Currently used account.
_cookie_id = _IMDbPY2013_cookie_id
_cookie_uu = _IMDbPY2013_cookie_uu
class _FakeURLOpener(object):
@ -141,9 +150,10 @@ class IMDbURLopener(FancyURLopener):
for header in ('User-Agent', 'User-agent', 'user-agent'):
self.del_header(header)
self.set_header('User-Agent', 'Mozilla/5.0')
self.set_header('Accept-Language', 'en-us,en;q=0.5')
# XXX: This class is used also to perform "Exact Primary
# [Title|Name]" searches, and so by default the cookie is set.
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu)
c_header = 'uu=%s; id=%s' % (_cookie_uu, _cookie_id)
self.set_header('Cookie', c_header)
def get_proxy(self):
@ -199,12 +209,11 @@ class IMDbURLopener(FancyURLopener):
server_encode = uopener.info().getparam('charset')
# Otherwise, look at the content-type HTML meta tag.
if server_encode is None and content:
first_bytes = content[:512]
begin_h = first_bytes.find('text/html; charset=')
begin_h = content.find('text/html; charset=')
if begin_h != -1:
end_h = first_bytes[19+begin_h:].find('"')
end_h = content[19+begin_h:].find('"')
if end_h != -1:
server_encode = first_bytes[19+begin_h:19+begin_h+end_h]
server_encode = content[19+begin_h:19+begin_h+end_h]
if server_encode:
try:
if lookup(server_encode):
@ -455,16 +464,16 @@ class IMDbHTTPAccessSystem(IMDbBase):
results is the maximum number of results to be retrieved."""
if isinstance(ton, unicode):
try:
ton = ton.encode('iso8859-1')
ton = ton.encode('utf-8')
except Exception, e:
try:
ton = ton.encode('utf-8')
ton = ton.encode('iso8859-1')
except Exception, e:
pass
##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results))
params = 'q=%s;s=%s;mx=%s' % (quote_plus(ton), kind, str(results))
params = 'q=%s&s=%s&mx=%s' % (quote_plus(ton), kind, str(results))
if kind == 'ep':
params = params.replace('s=ep;', 's=tt;ttype=ep;', 1)
params = params.replace('s=ep&', 's=tt&ttype=ep&', 1)
cont = self._retrieve(self.urls['find'] % params)
#print 'URL:', imdbURL_find % params
if cont.find('Your search returned more than') == -1 or \
@ -472,7 +481,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return cont
# The retrieved page contains no results, because too many
# titles or names contain the string we're looking for.
params = 'q=%s;ls=%s;lm=0' % (quote_plus(ton), kind)
params = 'q=%s&ls=%s&lm=0' % (quote_plus(ton), kind)
size = 131072 + results * 512
return self._retrieve(self.urls['find'] % params, size=size)
@ -587,6 +596,10 @@ class IMDbHTTPAccessSystem(IMDbBase):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations')
return self.mProxy.rec_parser.parse(cont)
def get_movie_critic_reviews(self, movieID):
    """Fetch and parse the "criticreviews" page of the given movie.

    The page address is the 'movie_main' URL template filled in with
    movieID, followed by 'criticreviews'.  Whatever the critic-reviews
    parser's parse() produces for that page is returned.
    """
    url = self.urls['movie_main'] % movieID + 'criticreviews'
    page = self._retrieve(url)
    return self.mProxy.criticrev_parser.parse(page)
def get_movie_external_reviews(self, movieID):
    """Fetch and parse the "externalreviews" page of the given movie.

    The page address is the 'movie_main' URL template filled in with
    movieID, followed by 'externalreviews'.  Whatever the
    external-reviews parser's parse() produces for that page is returned.
    """
    url = self.urls['movie_main'] % movieID + 'externalreviews'
    page = self._retrieve(url)
    return self.mProxy.externalrev_parser.parse(page)
@ -754,7 +767,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return self.pProxy.person_keywords_parser.parse(cont)
def _search_character(self, name, results):
cont = self._get_search_content('char', name, results)
cont = self._get_search_content('ch', name, results)
return self.scProxy.search_character_parser.parse(cont, results=results)['data']
def get_character_main(self, characterID):

View File

@ -9,7 +9,7 @@ pages would be:
plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
...and so on...
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -531,9 +531,6 @@ class DOMHTMLMovieParser(DOMParserBase):
def _process_plotsummary(x):
"""Process a plot (contributed by Rdian06)."""
xauthor = x.get('author')
if xauthor:
xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(',
'<').replace(')', '>').strip()
xplot = x.get('plot', u'').strip()
if xauthor:
xplot += u'::%s' % xauthor
@ -555,17 +552,20 @@ class DOMHTMLPlotParser(DOMParserBase):
# Notice that recently IMDb started to put the email of the
# author only in the link, that we're not collecting, here.
extractors = [Extractor(label='plot',
path="//p[@class='plotpar']",
attrs=Attribute(key='plot',
multi=True,
path={'plot': './text()',
'author': './i/a/text()'},
postprocess=_process_plotsummary))]
path="//ul[@class='zebraList']//p",
attrs=Attribute(key='plot',
multi=True,
path={'plot': './text()[1]',
'author': './span/em/a/text()'},
postprocess=_process_plotsummary))]
def _process_award(x):
award = {}
award['award'] = x.get('award').strip()
_award = x.get('award')
if _award is not None:
_award = _award.strip()
award['award'] = _award
if not award['award']:
return {}
award['year'] = x.get('year').strip()
@ -709,10 +709,16 @@ class DOMHTMLTaglinesParser(DOMParserBase):
result = tparser.parse(taglines_html_string)
"""
extractors = [Extractor(label='taglines',
path="//div[@id='tn15content']/p",
attrs=Attribute(key='taglines', multi=True,
path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]',
attrs=Attribute(key='taglines',
multi=True,
path="./text()"))]
def postprocess_data(self, data):
    """Strip surrounding whitespace from every collected tagline.

    When *data* has a 'taglines' entry it is replaced by a new list of
    the stripped strings; *data* itself is returned in every case.
    """
    if 'taglines' not in data:
        return data
    stripped = []
    for line in data['taglines']:
        stripped.append(line.strip())
    data['taglines'] = stripped
    return data
class DOMHTMLKeywordsParser(DOMParserBase):
"""Parser for the "keywords" page of a given movie.
@ -785,9 +791,9 @@ class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
]
def postprocess_data(self, data):
if 'soundtrack' in data:
if 'alternate versions' in data:
nd = []
for x in data['soundtrack']:
for x in data['alternate versions']:
ds = x.split('\n')
title = ds[0]
if title[0] == '"' and title[-1] == '"':
@ -846,6 +852,13 @@ class DOMHTMLCrazyCreditsParser(DOMParserBase):
x.replace('\n', ' ').replace(' ', ' ')))]
def _process_goof(x):
if x['spoiler_category']:
return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
else:
return x['category'].strip() + ': ' + x['text'].strip()
class DOMHTMLGoofsParser(DOMParserBase):
"""Parser for the "goofs" page of a given movie.
The page should be provided as a string, as taken from
@ -858,9 +871,14 @@ class DOMHTMLGoofsParser(DOMParserBase):
"""
_defGetRefs = True
extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li",
attrs=Attribute(key='goofs', multi=True, path=".//text()",
postprocess=lambda x: (x or u'').strip()))]
extractors = [Extractor(label='goofs', path="//div[@class='soda odd']",
attrs=Attribute(key='goofs', multi=True,
path={
'text':"./text()",
'category':'./preceding-sibling::h4[1]/text()',
'spoiler_category': './h4/text()'
},
postprocess=_process_goof))]
class DOMHTMLQuotesParser(DOMParserBase):
@ -876,9 +894,16 @@ class DOMHTMLQuotesParser(DOMParserBase):
_defGetRefs = True
extractors = [
Extractor(label='quotes',
path="//div[@class='_imdbpy']",
attrs=Attribute(key='quotes',
Extractor(label='quotes_odd',
path="//div[@class='quote soda odd']",
attrs=Attribute(key='quotes_odd',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip().replace(' \n',
'::').replace('::\n', '::').replace('\n', ' '))),
Extractor(label='quotes_even',
path="//div[@class='quote soda even']",
attrs=Attribute(key='quotes_even',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip().replace(' \n',
@ -886,27 +911,23 @@ class DOMHTMLQuotesParser(DOMParserBase):
]
preprocessors = [
(re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I),
r'\1<div class="_imdbpy">'),
(re.compile('<hr width="30%">', re.I), '</div>'),
(re.compile('<hr/>', re.I), '</div>'),
(re.compile('<script.*?</script>', re.I|re.S), ''),
# For BeautifulSoup.
(re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
]
(re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>', re.I), '')
]
def preprocess_dom(self, dom):
# Remove "link this quote" links.
for qLink in self.xpath(dom, "//p[@class='linksoda']"):
for qLink in self.xpath(dom, "//span[@class='linksoda']"):
qLink.drop_tree()
for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
qLink.drop_tree()
return dom
def postprocess_data(self, data):
if 'quotes' not in data:
quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
if not quotes:
return {}
for idx, quote in enumerate(data['quotes']):
data['quotes'][idx] = quote.split('::')
return data
quotes = [q.split('::') for q in quotes]
return {'quotes': quotes}
class DOMHTMLReleaseinfoParser(DOMParserBase):
@ -920,13 +941,13 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
result = rdparser.parse(releaseinfo_html_string)
"""
extractors = [Extractor(label='release dates',
path="//th[@class='xxxx']/../../tr",
path="//table[@id='release_dates']//tr",
attrs=Attribute(key='release dates', multi=True,
path={'country': ".//td[1]//text()",
'date': ".//td[2]//text()",
'notes': ".//td[3]//text()"})),
Extractor(label='akas',
path="//div[@class='_imdbpy_akas']/table/tr",
path="//table[@id='akas']//tr",
attrs=Attribute(key='akas', multi=True,
path={'title': "./td[1]/text()",
'countries': "./td[2]/text()"}))]
@ -961,7 +982,7 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
title = (aka.get('title') or '').strip()
if not title:
continue
countries = (aka.get('countries') or '').split('/')
countries = (aka.get('countries') or '').split(',')
if not countries:
nakas.append(title)
else:
@ -1135,7 +1156,28 @@ def _normalize_href(href):
href = '%s%s' % (imdbURL_base, href)
return href
class DOMHTMLCriticReviewsParser(DOMParserBase):
"""Parser for the "critic reviews" pages of a given movie.
The page should be provided as a string, as taken from
the akas.imdb.com server. The final result will be a
dictionary, with a key for every relevant section.
Example:
crparser = DOMHTMLCriticReviewsParser()
result = crparser.parse(criticreviews_html_string)
"""
kind = 'critic reviews'
extractors = [
# 'metascore': text found under the span inside the 'metascore_wrap' div.
Extractor(label='metascore',
path="//div[@class='metascore_wrap']/div/span",
attrs=Attribute(key='metascore',
path=".//text()")),
# 'metacritic url': href of the link in the article's 'see-more' div.
Extractor(label='metacritic url',
path="//div[@class='article']/div[@class='see-more']/a",
attrs=Attribute(key='metacritic url',
path="./@href")) ]
class DOMHTMLOfficialsitesParser(DOMParserBase):
"""Parser for the "official sites", "external reviews", "newsgroup
reviews", "miscellaneous links", "sound clips", "video clips" and
@ -1471,6 +1513,14 @@ class DOMHTMLSeasonEpisodesParser(DOMParserBase):
try: selected_season = int(selected_season)
except: pass
nd = {selected_season: {}}
if 'episode -1' in data:
counter = 1
for episode in data['episode -1']:
while 'episode %d' % counter in data:
counter += 1
k = 'episode %d' % counter
data[k] = [episode]
del data['episode -1']
for episode_nr, episode in data.iteritems():
if not (episode and episode[0] and
episode_nr.startswith('episode ')):
@ -1860,6 +1910,8 @@ _OBJECTS = {
'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
'ratings_parser': ((DOMHTMLRatingsParser,), None),
'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
'criticrev_parser': ((DOMHTMLCriticReviewsParser,),
{'kind': 'critic reviews'}),
'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
{'kind': 'external reviews'}),
'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),

View File

@ -8,7 +8,7 @@ E.g., for "Mel Gibson" the referred pages would be:
biography: http://akas.imdb.com/name/nm0000154/bio
...and so on...
Copyright 2004-20101 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -60,6 +60,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
result = cparser.parse(categorized_html_string)
"""
_containsObjects = True
_name_imdb_index = re.compile(r'\([IVXLCDM]+\)')
_birth_attrs = [Attribute(key='birth date',
path='.//time[@itemprop="birthDate"]/@datetime'),
@ -100,6 +101,10 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
path=".//text()",
postprocess=lambda x: analyze_name(x,
canonical=1))),
Extractor(label='name_index',
path="//h1[@class='header']/span[1]",
attrs=Attribute(key='name_index',
path="./text()")),
Extractor(label='birth info',
path="//div[h4='Born:']",
@ -110,7 +115,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
attrs=_death_attrs),
Extractor(label='headshot',
path="//td[@id='img_primary']/a",
path="//td[@id='img_primary']/div[@class='image']/a",
attrs=Attribute(key='headshot',
path="./img/@src")),
@ -152,6 +157,11 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
for what in 'birth date', 'death date':
if what in data and not data[what]:
del data[what]
name_index = (data.get('name_index') or '').strip()
if name_index:
if self._name_imdb_index.match(name_index):
data['imdbIndex'] = name_index[1:-1]
del data['name_index']
# XXX: the code below is for backwards compatibility
# probably could be removed
for key in data.keys():
@ -220,13 +230,13 @@ class DOMHTMLBioParser(DOMParserBase):
attrs=Attribute(key='headshot',
path="./img/@src")),
Extractor(label='birth info',
path="//div[h5='Date of Birth']",
path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
attrs=_birth_attrs),
Extractor(label='death info',
path="//div[h5='Date of Death']",
path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
attrs=_death_attrs),
Extractor(label='nick names',
path="//div[h5='Nickname']",
path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
attrs=Attribute(key='nick names',
path="./text()",
joiner='|',
@ -234,25 +244,25 @@ class DOMHTMLBioParser(DOMParserBase):
'::(', 1) for n in x.split('|')
if n.strip()])),
Extractor(label='birth name',
path="//div[h5='Birth Name']",
path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
attrs=Attribute(key='birth name',
path="./text()",
postprocess=lambda x: canonicalName(x.strip()))),
Extractor(label='height',
path="//div[h5='Height']",
path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
attrs=Attribute(key='height',
path="./text()",
postprocess=lambda x: x.strip())),
Extractor(label='mini biography',
path="//div[h5='Mini Biography']",
path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
attrs=Attribute(key='mini biography',
multi=True,
path={
'bio': "./p//text()",
'by': "./b/following-sibling::a/text()"
'bio': ".//text()",
'by': ".//a[@name='ba']//text()"
},
postprocess=lambda x: "%s::%s" % \
(x.get('bio').strip(),
((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
(x.get('by') or u'').strip() or u'Anonymous'))),
Extractor(label='spouse',
path="//div[h5='Spouse']/table/tr",

View File

@ -5,9 +5,9 @@ This module provides the HTMLSearchCharacterParser class (and the
search_character_parser instance), used to parse the results of a search
for a given character.
E.g., when searching for the name "Jesse James", the parsed page would be:
http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James
http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCharacterParser(DOMBasicMovieParser):
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCharacterParser
_notDirectHitTitle = '<title>imdb search'
_notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=False)
_linkPrefix = '/character/ch'
@ -57,7 +57,7 @@ class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
{'name': x.get('name')}
))]
extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \
path="//td[@class='result_text']/a[starts-with(@href, " \
"'/character/ch')]/..",
attrs=_attrs)]

View File

@ -7,7 +7,7 @@ for a given company.
E.g., when searching for the name "Columbia Pictures", the parsed page would be:
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCompanyParser(DOMBasicMovieParser):
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCompanyParser
_notDirectHitTitle = '<title>imdb company'
_notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_company_name(x)
_linkPrefix = '/company/co'
@ -59,7 +59,7 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
or u''), stripNotes=True)
))]
extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \
path="//td[@class='result_text']/a[starts-with(@href, " \
"'/company/co')]/..",
attrs=_attrs)]

View File

@ -8,7 +8,7 @@ E.g., for when searching for the title "the passion", the parsed
page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -77,7 +77,7 @@ class DOMBasicMovieParser(DOMParserBase):
def custom_analyze_title(title):
"""Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)"""
# XXX: very crappy. :-(
nt = title.split(' ')[0]
nt = title.split(' aka ')[0]
if nt:
title = nt
if not title:
@ -92,7 +92,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
"new search system" is used, for movies."""
_BaseParser = DOMBasicMovieParser
_notDirectHitTitle = '<title>imdb title'
_notDirectHitTitle = '<title>find - imdb</title>'
_titleBuilder = lambda self, x: build_title(x)
_linkPrefix = '/title/tt'
@ -101,8 +101,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
path={
'link': "./a[1]/@href",
'info': ".//text()",
#'akas': ".//div[@class='_imdbpyAKA']//text()"
'akas': ".//p[@class='find-aka']//text()"
'akas': "./i//text()"
},
postprocess=lambda x: (
analyze_imdbid(x.get('link') or u''),
@ -110,7 +109,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
x.get('akas')
))]
extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/title/tt')]/..",
path="//td[@class='result_text']",
attrs=_attrs)]
def _init(self):
self.url = u''
@ -119,14 +118,11 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
self.url = u''
def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower():
if self._notDirectHitTitle in html_string[:10240].lower():
if self._linkPrefix == '/title/tt':
# Only for movies.
# XXX (HTU): does this still apply?
html_string = html_string.replace('(TV mini-series)', '(mini)')
html_string = html_string.replace('<p class="find-aka">',
'<p class="find-aka">::')
#html_string = _reAKAStitles.sub(
# r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
return html_string
# Direct hit!
dbme = self._BaseParser(useModule=self._useModule)
@ -141,7 +137,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
title = self._titleBuilder(res[0][1])
if not (link and title): return u''
link = link.replace('http://pro.imdb.com', '')
new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link,
new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
title)
return new_html
@ -161,11 +157,14 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
if not datum[0] and datum[1]:
continue
if datum[2] is not None:
akas = filter(None, datum[2].split('::'))
#akas = filter(None, datum[2].split('::'))
if self._linkPrefix == '/title/tt':
akas = [a.replace('" - ', '::').rstrip() for a in akas]
akas = [a.replace('aka "', '', 1).replace('aka "',
'', 1).lstrip() for a in akas]
# XXX (HTU): couldn't find a result with multiple akas
aka = datum[2]
akas = [aka[1:-1]] # remove the quotes
#akas = [a.replace('" - ', '::').rstrip() for a in akas]
#akas = [a.replace('aka "', '', 1).replace('aka "',
#'', 1).lstrip() for a in akas]
datum[1]['akas'] = akas
data['data'][idx] = (datum[0], datum[1])
else:

View File

@ -7,7 +7,7 @@ for a given person.
E.g., when searching for the name "Mel Gibson", the parsed page would be:
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -55,7 +55,7 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
"""Parse the html page that the IMDb web server shows when the
"new search system" is used, for persons."""
_BaseParser = DOMBasicPersonParser
_notDirectHitTitle = '<title>imdb name'
_notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=True)
_linkPrefix = '/name/nm'
@ -74,11 +74,11 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
canonical=1), x.get('akas')
))]
extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/name/nm')]/..",
path="//td[@class='result_text']/a[starts-with(@href, '/name/nm')]/..",
attrs=_attrs)]
def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower():
if self._notDirectHitTitle in html_string[:10240].lower():
html_string = _reAKASp.sub(
r'\1<div class="_imdbpyAKA">\2::</div>\3',
html_string)

View File

@ -340,7 +340,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
title = title[:nidx].rstrip()
if year:
year = year.strip()
if title[-1] == ')':
if title[-1:] == ')':
fpIdx = title.rfind('(')
if fpIdx != -1:
if notes: notes = '%s %s' % (title[fpIdx:], notes)

View File

@ -6,7 +6,7 @@ IMDb's data for mobile systems.
the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile".
Copyright 2005-2011 Davide Alberani <da@erlug.linux.it>
Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -193,7 +193,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
title)
return res
tl = title[0].lower()
if not tl.startswith('imdb title'):
if not tl.startswith('find - imdb'):
# a direct hit!
title = _unHtml(title[0])
mid = None
@ -211,7 +211,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
# XXX: this results*3 prevents some recursion errors, but...
# it's not exactly understandable (i.e.: why 'results' is
# not enough to get all the results?)
lis = _findBetween(cont, 'td valign="top">', '</td>',
lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3)
for li in lis:
akas = re_makas.findall(li)
@ -492,7 +492,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
self._mobile_logger.warn('no title tag searching for name %s', name)
return res
nl = name[0].lower()
if not nl.startswith('imdb name'):
if not nl.startswith('find - imdb'):
# a direct hit!
name = _unHtml(name[0])
name = name.replace('- Filmography by type' , '').strip()
@ -506,7 +506,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res
res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
else:
lis = _findBetween(cont, 'td valign="top">', '</td>',
lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3)
for li in lis:
akas = _findBetween(li, '<em>"', '"</em>')
@ -771,7 +771,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return {'data': d}
def _search_character(self, name, results):
cont = subXMLRefs(self._get_search_content('char', name, results))
cont = subXMLRefs(self._get_search_content('ch', name, results))
name = _findBetween(cont, '<title>', '</title>', maxRes=1)
res = []
if not name:
@ -779,8 +779,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
name)
return res
nl = name[0].lower()
if not (nl.startswith('imdb search') or nl.startswith('imdb search') \
or nl.startswith('imdb character')):
if not nl.startswith('find - imdb'):
# a direct hit!
name = _unHtml(name[0]).replace('(Character)', '').strip()
pid = None
@ -793,23 +792,18 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res
res[:] = [(str(pid[0]), analyze_name(name))]
else:
sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>',
maxRes=results*3)
sects += _findBetween(cont, '<b>Characters', '</table>',
maxRes=results*3)
for sect in sects:
lis = _findBetween(sect, '<a href="/character/',
['<small', '</td>', '<br'])
for li in lis:
li = '<%s' % li
pid = re_imdbID.findall(li)
pname = _unHtml(li)
if not (pid and pname):
self._mobile_logger.debug('no name/characterID' \
' parsing %s searching for' \
' character %s', li, name)
continue
res.append((str(pid[0]), analyze_name(pname)))
lis = _findBetween(cont, '<td class="result_text"',
['<small', '</td>', '<br'])
for li in lis:
li = '<%s' % li
pid = re_imdbID.findall(li)
pname = _unHtml(li)
if not (pid and pname):
self._mobile_logger.debug('no name/characterID' \
' parsing %s searching for' \
' character %s', li, name)
continue
res.append((str(pid[0]), analyze_name(pname)))
return res
def get_character_main(self, characterID):

View File

@ -7,7 +7,7 @@ the SQLObject _AND_ SQLAlchemy Object Relational Managers is available.
the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "sql", "database" or "db".
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it>
Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -452,7 +452,12 @@ def get_movie_data(movieID, kindDict, fromAka=0, _table=None):
else:
if not fromAka: Table = Title
else: Table = AkaTitle
m = Table.get(movieID)
try:
m = Table.get(movieID)
except Exception, e:
_aux_logger.warn('Unable to fetch information for movieID %s: %s', movieID, e)
mdict = {}
return mdict
mdict = {'title': m.title, 'kind': kindDict[m.kindID],
'year': m.productionYear, 'imdbIndex': m.imdbIndex,
'season': m.seasonNr, 'episode': m.episodeNr}
@ -825,14 +830,14 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = movie.imdbID
if imdbID is not None: return '%07d' % imdbID
m_dict = get_movie_data(movie.id, self._kind)
titline = build_title(m_dict, ptdf=1)
imdbID = self.title2imdbID(titline)
titline = build_title(m_dict, ptdf=0)
imdbID = self.title2imdbID(titline, m_dict['kind'])
# If the imdbID was retrieved from the web and was not in the
# database, update the database (ignoring errors, because it's
# possible that the current user does not have update privileges).
# There're times when I think I'm a genius; this is one of
# those times... <g>
if imdbID is not None:
if imdbID is not None and not isinstance(imdbID, list):
try: movie.imdbID = int(imdbID)
except: pass
return imdbID
@ -847,9 +852,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = person.imdbID
if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex}
namline = build_name(n_dict, canonical=1)
namline = build_name(n_dict, canonical=False)
imdbID = self.name2imdbID(namline)
if imdbID is not None:
if imdbID is not None and not isinstance(imdbID, list):
try: person.imdbID = int(imdbID)
except: pass
return imdbID
@ -864,9 +869,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = character.imdbID
if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex}
namline = build_name(n_dict, canonical=1)
namline = build_name(n_dict, canonical=False)
imdbID = self.character2imdbID(namline)
if imdbID is not None:
if imdbID is not None and not isinstance(imdbID, list):
try: character.imdbID = int(imdbID)
except: pass
return imdbID
@ -883,7 +888,7 @@ class IMDbSqlAccessSystem(IMDbBase):
n_dict = {'name': company.name, 'country': company.countryCode}
namline = build_company_name(n_dict)
imdbID = self.company2imdbID(namline)
if imdbID is not None:
if imdbID is not None and not isinstance(imdbID, list):
try: company.imdbID = int(imdbID)
except: pass
return imdbID
@ -1116,8 +1121,9 @@ class IMDbSqlAccessSystem(IMDbBase):
if mlinks:
for ml in mlinks:
lmovieData = get_movie_data(ml[0], self._kind)
m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql')
ml[0] = m
if lmovieData:
m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql')
ml[0] = m
res['connections'] = {}
mlinks[:] = _groupListBy(mlinks, 1)
for group in mlinks:

View File

@ -466,6 +466,7 @@ class _AlchemyConnection(object):
def setConnection(uri, tables, encoding='utf8', debug=False):
"""Set connection for every table."""
params = {'encoding': encoding}
# FIXME: why on earth MySQL requires an additional parameter,
# is well beyond my understanding...
if uri.startswith('mysql'):
@ -474,7 +475,11 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
else:
uri += '?'
uri += 'charset=%s' % encoding
params = {'encoding': encoding}
# On some server configurations, we will need to explicitly enable
# loading data from local files
params['local_infile'] = 1
if debug:
params['echo'] = True
if uri.startswith('ibm_db'):

Binary file not shown.

View File

@ -182,6 +182,10 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
kw['use_unicode'] = 1
#kw['sqlobject_encoding'] = encoding
kw['charset'] = encoding
# On some server configurations, we will need to explicitly enable
# loading data from local files
kw['local_infile'] = 1
conn = connectionForURI(uri, **kw)
conn.debug = debug
# XXX: doesn't work and a work-around was put in imdbpy2sql.py;

View File

@ -3,7 +3,7 @@ utils module (imdb package).
This module provides basic utilities for the imdb package.
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it>
Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify
@ -189,10 +189,9 @@ _unicodeArticles = linguistics.toUnicode(_articles)
articlesDicts = linguistics.articlesDictsForLang(None)
spArticles = linguistics.spArticlesForLang(None)
def canonicalTitle(title, lang=None):
def canonicalTitle(title, lang=None, imdbIndex=None):
"""Return the title in the canonic format 'Movie Title, The';
beware that it doesn't handle long imdb titles, but only the
title portion, without year[/imdbIndex] or special markup.
beware that it doesn't handle long imdb titles.
The 'lang' argument can be used to specify the language of the title.
"""
isUnicode = isinstance(title, unicode)
@ -203,15 +202,19 @@ def canonicalTitle(title, lang=None):
except IndexError:
pass
if isUnicode:
_format = u'%s, %s'
_format = u'%s%s, %s'
else:
_format = '%s, %s'
_format = '%s%s, %s'
ltitle = title.lower()
if imdbIndex:
imdbIndex = ' (%s)' % imdbIndex
else:
imdbIndex = ''
spArticles = linguistics.spArticlesForLang(lang)
for article in spArticles[isUnicode]:
if ltitle.startswith(article):
lart = len(article)
title = _format % (title[lart:], title[:lart])
title = _format % (title[lart:], imdbIndex, title[:lart])
if article[-1] == ' ':
title = title[:-1]
break
@ -383,18 +386,42 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if title.endswith('(TV)'):
kind = u'tv movie'
title = title[:-4].rstrip()
elif title.endswith('(TV Movie)'):
kind = u'tv movie'
title = title[:-10].rstrip()
elif title.endswith('(V)'):
kind = u'video movie'
title = title[:-3].rstrip()
elif title.endswith('(video)'):
elif title.lower().endswith('(video)'):
kind = u'video movie'
title = title[:-7].rstrip()
elif title.endswith('(TV Short)'):
kind = u'tv short'
title = title[:-10].rstrip()
elif title.endswith('(TV Mini-Series)'):
kind = u'tv mini series'
title = title[:-16].rstrip()
elif title.endswith('(mini)'):
kind = u'tv mini series'
title = title[:-6].rstrip()
elif title.endswith('(VG)'):
kind = u'video game'
title = title[:-4].rstrip()
elif title.endswith('(Video Game)'):
kind = u'video game'
title = title[:-12].rstrip()
elif title.endswith('(TV Series)'):
epindex = title.find('(TV Episode) - ')
if epindex >= 0:
# It's an episode of a series.
kind = u'episode'
series_info = analyze_title(title[epindex + 15:])
result['episode of'] = series_info.get('title')
result['series year'] = series_info.get('year')
title = title[:epindex]
else:
kind = u'tv series'
title = title[:-11].rstrip()
# Search for the year and the optional imdbIndex (a roman number).
yi = re_year_index.findall(title)
if not yi:
@ -430,9 +457,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if not kind:
kind = u'tv series'
title = title[1:-1].strip()
elif title.endswith('(TV series)'):
kind = u'tv series'
title = title[:-11].rstrip()
if not title:
raise IMDbParserError('invalid title: "%s"' % original_t)
if canonical is not None:
@ -489,7 +513,7 @@ def _convertTime(title, fromPTDFtoWEB=1, _emptyString=u''):
def build_title(title_dict, canonical=None, canonicalSeries=None,
canonicalEpisode=None, ptdf=0, lang=None, _doYear=1,
_emptyString=u''):
_emptyString=u'', appendKind=True):
"""Given a dictionary that represents a "long" IMDb title,
return a string.
@ -511,6 +535,11 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
doYear = 0
if ptdf:
doYear = 1
# XXX: for results coming from the new search page.
if not isinstance(episode_of, (dict, _Container)):
episode_of = {'title': episode_of, 'kind': 'tv series'}
if 'series year' in title_dict:
episode_of['year'] = title_dict['series year']
pre_title = build_title(episode_of, canonical=canonicalSeries,
ptdf=0, _doYear=doYear,
_emptyString=_emptyString)
@ -545,12 +574,14 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
episode_title += '.%s' % episode
episode_title += ')'
episode_title = '{%s}' % episode_title
return '%s %s' % (pre_title, episode_title)
return _emptyString + '%s %s' % (_emptyString + pre_title,
_emptyString + episode_title)
title = title_dict.get('title', '')
imdbIndex = title_dict.get('imdbIndex', '')
if not title: return _emptyString
if canonical is not None:
if canonical:
title = canonicalTitle(title, lang=lang)
title = canonicalTitle(title, lang=lang, imdbIndex=imdbIndex)
else:
title = normalizeTitle(title, lang=lang)
if pre_title:
@ -558,15 +589,20 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
if kind in (u'tv series', u'tv mini series'):
title = '"%s"' % title
if _doYear:
imdbIndex = title_dict.get('imdbIndex')
year = title_dict.get('year') or u'????'
year = title_dict.get('year') or '????'
if isinstance(_emptyString, str):
year = str(year)
title += ' (%s' % year
if imdbIndex:
title += '/%s' % imdbIndex
title += ')'
if kind:
imdbIndex = title_dict.get('imdbIndex')
if not ptdf:
if imdbIndex and (canonical is None or canonical):
title += ' (%s)' % imdbIndex
title += ' (%s)' % year
else:
title += ' (%s' % year
if imdbIndex and (canonical is None or canonical):
title += '/%s' % imdbIndex
title += ')'
if appendKind and kind:
if kind == 'tv movie':
title += ' (TV)'
elif kind == 'video movie':

View File

@ -11,6 +11,7 @@ __author__ = "dbr/Ben"
__version__ = "1.9"
import os
import re
import time
import getpass
import StringIO
@ -18,8 +19,10 @@ import tempfile
import warnings
import logging
import zipfile
import datetime as dt
import requests
import cachecontrol
import xmltodict
try:
import xml.etree.cElementTree as ElementTree
@ -31,6 +34,7 @@ try:
except ImportError:
gzip = None
from lib.dateutil.parser import parse
from cachecontrol import caches
from tvdb_ui import BaseUI, ConsoleUI
@ -560,44 +564,71 @@ class Tvdb:
except requests.Timeout, e:
raise tvdb_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
if 'application/zip' in resp.headers.get("Content-Type", '') and resp.ok:
try:
# TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20]
log().debug("We recived a zip file unpacking now ...")
zipdata = StringIO.StringIO()
zipdata.write(resp.content)
myzipfile = zipfile.ZipFile(zipdata)
return myzipfile.read('%s.xml' % language)
except zipfile.BadZipfile:
raise tvdb_error("Bad zip file received from thetvdb.com, could not read it")
def process(path, key, value):
key = key.lower()
return resp.content if resp.ok else None
# clean up value and do type changes
if value:
try:
# convert to integer if needed
if value.isdigit():
value = int(value)
except:
pass
if key in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
try:
if key == 'firstaired' and value in "0000-00-00":
new_value = str(dt.date.fromordinal(1))
new_value = re.sub("([-]0{2}){1,}", "", new_value)
fixDate = parse(new_value, fuzzy=True).date()
value = fixDate.strftime("%Y-%m-%d")
elif key == 'firstaired':
value = parse(value, fuzzy=True).date()
value = value.strftime("%Y-%m-%d")
except:
pass
value = self._cleanData(value)
return (key, value)
if resp.ok:
if 'application/zip' in resp.headers.get("Content-Type", ''):
try:
# TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20]
log().debug("We recived a zip file unpacking now ...")
zipdata = StringIO.StringIO()
zipdata.write(resp.content)
myzipfile = zipfile.ZipFile(zipdata)
return xmltodict.parse(myzipfile.read('%s.xml' % language), postprocessor=process)
except zipfile.BadZipfile:
raise tvdb_error("Bad zip file received from thetvdb.com, could not read it")
else:
return xmltodict.parse(resp.text.strip(), postprocessor=process)
def _getetsrc(self, url, params=None, language=None):
"""Loads a URL using caching, returns an ElementTree of the source
"""
src = self._loadUrl(url, params=params, language=language)
try:
# TVDB doesn't sanitize \r (CR) from user input in some fields,
# remove it to avoid errors. Change from SickBeard, from will14m
return ElementTree.fromstring(src.rstrip("\r")) if src else None
except SyntaxError:
src = self._loadUrl(url, params=params, language=language)
try:
return ElementTree.fromstring(src.rstrip("\r")) if src else None
except SyntaxError, exceptionmsg:
errormsg = "There was an error with the XML retrieved from thetvdb.com:\n%s" % (
exceptionmsg
src = [src[item] for item in src][0]
except:
errormsg = "There was an error with the XML retrieved from thetvdb.com:"
if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
self.config['cache_location']
)
if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
self.config['cache_location']
)
errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on"
errormsg += "\nhttp://dbr.lighthouseapp.com/projects/13342-tvdb_api/overview\n"
raise tvdb_error(errormsg)
errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on"
errormsg += "\nhttp://dbr.lighthouseapp.com/projects/13342-tvdb_api/overview\n"
raise tvdb_error(errormsg)
return src
def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and
@ -649,9 +680,8 @@ class Tvdb:
log().debug("Searching for show %s" % series)
self.config['params_getSeries']['seriesname'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(), s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries
return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series):
"""This searches TheTVDB.com for the series name,
@ -798,24 +828,13 @@ class Tvdb:
self.config['url_seriesInfo'] % (sid, getShowInLanguage)
)
if seriesInfoEt is None: return False
for curInfo in seriesInfoEt.findall("Series")[0]:
tag = curInfo.tag.lower()
value = curInfo.text
# check and make sure we have data to process and that it contains a series name
if seriesInfoEt is None or 'seriesname' not in seriesInfoEt['series']:
return False
if tag == 'seriesname' and value is None:
return False
for k, v in seriesInfoEt['series'].items():
self._setShowData(sid, k, v)
if value is not None:
if tag == 'id':
value = int(value)
if tag in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
if seriesSearch:
return True
@ -837,63 +856,40 @@ class Tvdb:
epsEt = self._getetsrc(url, language=language)
for cur_ep in epsEt.findall("Episode"):
for cur_ep in epsEt["episode"]:
if self.config['dvdorder']:
log().debug('Using DVD ordering.')
use_dvd = cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None
use_dvd = cur_ep['dvd_season'] != None and cur_ep['dvd_episodenumber'] != None
else:
use_dvd = False
if use_dvd:
elem_seasnum, elem_epno = cur_ep.find('DVD_season'), cur_ep.find('DVD_episodenumber')
seasnum, epno = cur_ep['dvd_season'], cur_ep['dvd_episodenumber']
else:
elem_seasnum, elem_epno = cur_ep.find('SeasonNumber'), cur_ep.find('EpisodeNumber')
if elem_seasnum is None or elem_epno is None:
seasnum, epno = cur_ep['seasonnumber'], cur_ep['episodenumber']
if seasnum is None or epno is None:
log().warning("An episode has incomplete season/episode number (season: %r, episode: %r)" % (
elem_seasnum, elem_epno))
log().debug(
" ".join(
"%r is %r" % (child.tag, child.text) for child in cur_ep.getchildren()))
# TODO: Should this happen?
seasnum, epno))
continue # Skip to next episode
# float() is because https://github.com/dbr/tvnamer/issues/95 - should probably be fixed in TVDB data
seas_no = int(float(elem_seasnum.text))
ep_no = int(float(elem_epno.text))
seas_no = int(float(seasnum))
ep_no = int(float(epno))
useDVD = False
for k,v in cur_ep.items():
k = k.lower()
if (self.config['dvdorder']):
log().debug('DVD Order? Yes')
useDVD = (cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None)
else:
log().debug('DVD Order? No')
if v is not None:
if k == 'id':
v = int(v)
if (useDVD):
log().debug('Use DVD Order? Yes')
seas_no = int(cur_ep.find('DVD_season').text)
ep_no = int(float(cur_ep.find('DVD_episodenumber').text))
else:
log().debug('Use DVD Order? No')
seas_no = int(cur_ep.find('SeasonNumber').text)
ep_no = int(cur_ep.find('EpisodeNumber').text)
for cur_item in cur_ep.getchildren():
tag = cur_item.tag.lower()
value = cur_item.text
if value is not None:
if tag == 'id':
value = int(value)
if tag == 'filename':
value = self.config['url_artworkPrefix'] % (value)
if k == 'filename':
v = self.config['url_artworkPrefix'] % (v)
else:
value = self._cleanData(value)
self._setItem(sid, seas_no, ep_no, tag, value)
v = self._cleanData(v)
self._setItem(sid, seas_no, ep_no, k, v)
return True

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python2
#encoding:utf-8
#author:echel0n
#project:tvrage_api
@ -24,6 +24,7 @@ import logging
import datetime as dt
import requests
import cachecontrol
import xmltodict
try:
import xml.etree.cElementTree as ElementTree
@ -35,11 +36,13 @@ from cachecontrol import caches
from tvrage_ui import BaseUI
from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound,
tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound)
tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound)
def log():
    """Return the logger registered under the name "tvrage_api"."""
    tvrage_logger = logging.getLogger("tvrage_api")
    return tvrage_logger
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
"""Retry calling the decorated function using an exponential backoff.
@ -83,6 +86,7 @@ def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
return deco_retry
class ShowContainer(dict):
"""Simple dict that holds a series of Show instances
"""
@ -105,13 +109,14 @@ class ShowContainer(dict):
_lastgc = time.time()
del tbd
super(ShowContainer, self).__setitem__(key, value)
class Show(dict):
"""Holds a dict of seasons, and show data.
"""
def __init__(self):
dict.__init__(self)
self.data = {}
@ -157,7 +162,7 @@ class Show(dict):
raise tvrage_episodenotfound("Could not find any episodes that aired on %s" % date)
return ret
def search(self, term = None, key = None):
def search(self, term=None, key=None):
"""
Search all episodes in show. Can search all data, or a specific key (for
example, episodename)
@ -173,7 +178,7 @@ class Show(dict):
"""
results = []
for cur_season in self.values():
searchresult = cur_season.search(term = term, key = key)
searchresult = cur_season.search(term=term, key=key)
if len(searchresult) != 0:
results.extend(searchresult)
@ -181,7 +186,7 @@ class Show(dict):
class Season(dict):
def __init__(self, show = None):
def __init__(self, show=None):
"""The show attribute points to the parent show
"""
self.show = show
@ -202,13 +207,13 @@ class Season(dict):
else:
return dict.__getitem__(self, episode_number)
def search(self, term = None, key = None):
def search(self, term=None, key=None):
"""Search all episodes in season, returns a list of matching Episode
instances.
"""
results = []
for ep in self.values():
searchresult = ep.search(term = term, key = key)
searchresult = ep.search(term=term, key=key)
if searchresult is not None:
results.append(
searchresult
@ -217,7 +222,7 @@ class Season(dict):
class Episode(dict):
def __init__(self, season = None):
def __init__(self, season=None):
"""The season attribute points to the parent season
"""
self.season = season
@ -242,7 +247,7 @@ class Episode(dict):
except KeyError:
raise tvrage_attributenotfound("Cannot find attribute %s" % (repr(key)))
def search(self, term = None, key = None):
def search(self, term=None, key=None):
"""Search episode data for term, if it matches, return the Episode (self).
The key parameter can be used to limit the search to a specific element,
for example, episodename.
@ -258,25 +263,27 @@ class Episode(dict):
if key is not None and cur_key != key:
# Do not search this key
continue
if cur_value.find( unicode(term).lower() ) > -1:
if cur_value.find(unicode(term).lower()) > -1:
return self
class TVRage:
"""Create easy-to-use interface to name of season/episode name"""
def __init__(self,
interactive = False,
select_first = False,
debug = False,
cache = True,
banners = False,
actors = False,
custom_ui = None,
language = None,
search_all_languages = False,
apikey = None,
forceConnect=False,
useZip=False,
dvdorder=False):
interactive=False,
select_first=False,
debug=False,
cache=True,
banners=False,
actors=False,
custom_ui=None,
language=None,
search_all_languages=False,
apikey=None,
forceConnect=False,
useZip=False,
dvdorder=False):
"""
cache (True/False/str/unicode/urllib2 opener):
@ -294,18 +301,18 @@ class TVRage:
return an exception immediately.
"""
self.shows = ShowContainer() # Holds all Show classes
self.corrections = {} # Holds show-name to show_id mapping
self.sess = requests.session() # HTTP Session
self.shows = ShowContainer() # Holds all Show classes
self.corrections = {} # Holds show-name to show_id mapping
self.sess = requests.session() # HTTP Session
self.config = {}
if apikey is not None:
self.config['apikey'] = apikey
else:
self.config['apikey'] = "Uhewg1Rr0o62fvZvUIZt" # tvdb_api's API key
self.config['apikey'] = "Uhewg1Rr0o62fvZvUIZt" # tvdb_api's API key
self.config['debug_enabled'] = debug # show debugging messages
self.config['debug_enabled'] = debug # show debugging messages
self.config['custom_ui'] = custom_ui
@ -322,8 +329,8 @@ class TVRage:
if self.config['debug_enabled']:
warnings.warn("The debug argument to tvrage_api.__init__ will be removed in the next version. "
"To enable debug messages, use the following code before importing: "
"import logging; logging.basicConfig(level=logging.DEBUG)")
"To enable debug messages, use the following code before importing: "
"import logging; logging.basicConfig(level=logging.DEBUG)")
logging.basicConfig(level=logging.DEBUG)
@ -331,8 +338,8 @@ class TVRage:
# Hard-coded here as it is relatively static, and saves another HTTP request, as
# recommended on http://tvrage.com/wiki/index.php/API:languages.xml
self.config['valid_languages'] = [
"da", "fi", "nl", "de", "it", "es", "fr","pl", "hu","el","tr",
"ru","he","ja","pt","zh","cs","sl", "hr","ko","en","sv","no"
"da", "fi", "nl", "de", "it", "es", "fr", "pl", "hu", "el", "tr",
"ru", "he", "ja", "pt", "zh", "cs", "sl", "hr", "ko", "en", "sv", "no"
]
# tvrage.com should be based around numeric language codes,
@ -340,9 +347,9 @@ class TVRage:
# requires the language ID, thus this mapping is required (mainly
# for usage in tvrage_ui - internally tvrage_api will use the language abbreviations)
self.config['langabbv_to_id'] = {'el': 20, 'en': 7, 'zh': 27,
'it': 15, 'cs': 28, 'es': 16, 'ru': 22, 'nl': 13, 'pt': 26, 'no': 9,
'tr': 21, 'pl': 18, 'fr': 17, 'hr': 31, 'de': 14, 'da': 10, 'fi': 11,
'hu': 19, 'ja': 25, 'he': 24, 'ko': 32, 'sv': 8, 'sl': 30}
'it': 15, 'cs': 28, 'es': 16, 'ru': 22, 'nl': 13, 'pt': 26, 'no': 9,
'tr': 21, 'pl': 18, 'fr': 17, 'hr': 31, 'de': 14, 'da': 10, 'fi': 11,
'hu': 19, 'ja': 25, 'he': 24, 'ko': 32, 'sv': 8, 'sl': 30}
if language is None:
self.config['language'] = 'en'
@ -390,9 +397,9 @@ class TVRage:
# get response from TVRage
if self.config['cache_enabled']:
resp = self.sess.get(url, cache_auto=True, params=params)
resp = self.sess.get(url.strip(), cache_auto=True, params=params)
else:
resp = requests.get(url, params=params)
resp = requests.get(url.strip(), params=params)
except requests.HTTPError, e:
raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url))
@ -403,81 +410,84 @@ class TVRage:
except requests.Timeout, e:
raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
return resp.content if resp.ok else None
def remap_keys(path, key, value):
    """Rename and normalise one parsed TVRage XML node.

    Used as the ``postprocessor`` callback handed to ``xmltodict.parse``;
    the returned pair replaces the node being processed.

    path  -- node location as supplied by the parser (not used here).
    key   -- tag name; looked up case-insensitively in the rename table.
    value -- parsed content of the node.

    Returns a ``(key, value)`` tuple.  Empty values are returned untouched.
    Non-empty values are normalised (network/genre dicts flattened, digit
    strings turned into ints, 'firstaired' reformatted as YYYY-MM-DD) and
    finally passed through ``self._cleanData``.
    """
    # TVRage tag name (lower case) -> field name used by the rest of the API
    name_map = {
        'showid': 'id',
        'showname': 'seriesname',
        'name': 'seriesname',
        'summary': 'overview',
        'started': 'firstaired',
        'genres': 'genre',
        'airtime': 'airs_time',
        'airday': 'airs_dayofweek',
        'image': 'fanart',
        'epnum': 'absolute_number',
        'title': 'episodename',
        'airdate': 'firstaired',
        'screencap': 'filename',
        'seasonnum': 'episodenumber'
    }

    try:
        key = name_map[key.lower()]
    except (ValueError, TypeError, KeyError):
        # NOTE(review): the original fallback was a bare `key.lower()` whose
        # result was discarded, so tags without an entry in name_map keep
        # their original case.  That behaviour is preserved here because
        # code outside this block may index the parsed dict by the original
        # tag case; confirm with the consumers before switching to
        # `key = key.lower()`.
        pass

    # clean up value and do type changes
    if value:
        if isinstance(value, dict):
            if key == 'network':
                value = value['#text']
            if key == 'genre':
                genres = value['genre']
                if not isinstance(genres, list):
                    genres = [genres]
                # Skip empty entries and coerce the rest to text so that a
                # blank or numeric genre cannot make the join raise.
                names = ['%s' % genre for genre in genres if genre is not None]
                value = '|' + '|'.join(names) + '|'

        try:
            # convert to integer if needed
            if value.isdigit():
                value = int(value)
        except (AttributeError, ValueError):
            # value has no isdigit() (e.g. a dict or list), or consists of
            # digit characters that int() cannot read: keep it unchanged.
            pass

        if key == 'firstaired':
            try:
                # Substring test: the full "0000-00-00" placeholder and any
                # fragment of it take the placeholder path.  A non-string
                # value raises TypeError here and is left unchanged.
                if value in "0000-00-00":
                    new_value = str(dt.date.fromordinal(1))
                    new_value = re.sub("([-]0{2}){1,}", "", new_value)
                    fixed = parse(new_value, fuzzy=True).date()
                else:
                    fixed = parse(value, fuzzy=True).date()
                # Assign only after formatting succeeded, so a failure never
                # leaves a half-converted date object behind.
                value = fixed.strftime("%Y-%m-%d")
            except Exception:
                # Best effort: an unparseable date is kept as received.
                pass

        value = self._cleanData(value)
    return (key, value)
if resp.ok:
return xmltodict.parse(resp.text.strip(), postprocessor=remap_keys)
def _getetsrc(self, url, params=None):
"""Loads a URL using caching, returns an ElementTree of the source
"""
reDict = {
'showid': 'id',
'showname': 'seriesname',
'name': 'seriesname',
'summary': 'overview',
'started': 'firstaired',
'genres': 'genre',
'airtime': 'airs_time',
'airday': 'airs_dayofweek',
'image': 'fanart',
'epnum': 'absolute_number',
'title': 'episodename',
'airdate': 'firstaired',
'screencap': 'filename',
'seasonnum': 'episodenumber',
}
robj = re.compile('|'.join(reDict.keys()))
src = self._loadUrl(url, params)
try:
# TVRAGE doesn't sanitize \r (CR) from user input in some fields,
# remove it to avoid errors. Change from SickBeard, from will14m
xml = ElementTree.fromstring(src.rstrip("\r"))
tree = ElementTree.ElementTree(xml)
for elm in tree.findall('.//*'):
elm.tag = robj.sub(lambda m: reDict[m.group(0)], elm.tag)
if elm.tag in 'firstaired':
try:
if elm.text in "0000-00-00":
elm.text = str(dt.date.fromordinal(1))
elm.text = re.sub("([-]0{2}){1,}", "", elm.text)
fixDate = parse(elm.text, fuzzy=True).date()
elm.text = fixDate.strftime("%Y-%m-%d")
except:
pass
return ElementTree.fromstring(ElementTree.tostring(xml))
except SyntaxError:
src = self._loadUrl(url, params)
try:
xml = ElementTree.fromstring(src.rstrip("\r"))
tree = ElementTree.ElementTree(xml)
for elm in tree.findall('.//*'):
elm.tag = robj.sub(lambda m: reDict[m.group(0)], elm.tag)
src = [src[item] for item in src][0]
except:
errormsg = "There was an error with the XML retrieved from tvrage.com"
if elm.tag in 'firstaired' and elm.text:
if elm.text == "0000-00-00":
elm.text = str(dt.date.fromordinal(1))
try:
#month = strptime(match.group('air_month')[:3],'%b').tm_mon
#day = re.sub("(st|nd|rd|th)", "", match.group('air_day'))
#dtStr = '%s/%s/%s' % (year, month, day)
fixDate = parse(elm.text, fuzzy=True)
elm.text = fixDate.strftime("%Y-%m-%d")
except:
pass
return ElementTree.fromstring(ElementTree.tostring(xml))
except SyntaxError, exceptionmsg:
errormsg = "There was an error with the XML retrieved from tvrage.com:\n%s" % (
exceptionmsg
if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
self.config['cache_location']
)
if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
self.config['cache_location']
)
errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on\n"
raise tvrage_error(errormsg)
errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on\n"
raise tvrage_error(errormsg)
return src
def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and
@ -497,9 +507,9 @@ class TVRage:
if sid not in self.shows:
self.shows[sid] = Show()
if seas not in self.shows[sid]:
self.shows[sid][seas] = Season(show = self.shows[sid])
self.shows[sid][seas] = Season(show=self.shows[sid])
if ep not in self.shows[sid][seas]:
self.shows[sid][seas][ep] = Episode(season = self.shows[sid][seas])
self.shows[sid][seas][ep] = Episode(season=self.shows[sid][seas])
self.shows[sid][seas][ep][attrib] = value
def _setShowData(self, sid, key, value):
@ -529,9 +539,8 @@ class TVRage:
log().debug("Searching for show %s" % series)
self.config['params_getSeries']['show'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(),s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries
return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series):
"""This searches tvrage.com for the series name,
@ -547,10 +556,10 @@ class TVRage:
if self.config['custom_ui'] is not None:
log().debug("Using custom UI %s" % (repr(self.config['custom_ui'])))
ui = self.config['custom_ui'](config = self.config)
ui = self.config['custom_ui'](config=self.config)
else:
log().debug('Auto-selecting first search result using BaseUI')
ui = BaseUI(config = self.config)
ui = BaseUI(config=self.config)
return ui.selectSeries(allSeries)
@ -568,62 +577,49 @@ class TVRage:
self.config['params_seriesInfo']
)
if seriesInfoEt is None: return False
for curInfo in seriesInfoEt:
tag = curInfo.tag.lower()
value = curInfo.text
# check and make sure we have data to process and that it contains a series name
if seriesInfoEt is None or 'seriesname' not in seriesInfoEt:
return False
if tag == 'seriesname' and value is None:
return False
for k, v in seriesInfoEt.items():
self._setShowData(sid, k, v)
if tag == 'id':
value = int(value)
if value is not None:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
if seriesSearch: return True
try:
# Parse genre data
log().debug('Getting genres of %s' % (sid))
for genre in seriesInfoEt.find('genres'):
tag = genre.tag.lower()
value = genre.text
if value is not None:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
except Exception:
log().debug('No genres for %s' % (sid))
# series search ends here
if seriesSearch:
return True
# Parse episode data
log().debug('Getting all episodes of %s' % (sid))
self.config['params_epInfo']['sid'] = sid
epsEt = self._getetsrc(self.config['url_epInfo'], self.config['params_epInfo'])
for cur_list in epsEt.findall("Episodelist"):
for cur_seas in cur_list:
try:
seas_no = int(cur_seas.attrib['no'])
for cur_ep in cur_seas:
ep_no = int(cur_ep.find('episodenumber').text)
self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no)
for cur_item in cur_ep:
tag = cur_item.tag.lower()
value = cur_item.text
if value is not None:
if tag == 'id':
value = int(value)
for season in epsEt['Episodelist']['Season']:
episodes = season['episode']
if not isinstance(episodes, list):
episodes = [episodes]
value = self._cleanData(value)
for episode in episodes:
seas_no = int(season['@no'])
ep_no = int(episode['episodenumber'])
self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no)
self._setItem(sid, seas_no, ep_no, tag, value)
except:
continue
for k,v in episode.items():
try:
k = k.lower()
if v is not None:
if k == 'link':
v = v.rsplit('/', 1)[1]
k = 'id'
if k == 'id':
v = int(v)
v = self._cleanData(v)
self._setItem(sid, seas_no, ep_no, k, v)
except:
continue
return True
def _nameToSid(self, name):
@ -632,7 +628,7 @@ class TVRage:
the correct SID.
"""
if name in self.corrections:
log().debug('Correcting %s to %s' % (name, self.corrections[name]) )
log().debug('Correcting %s to %s' % (name, self.corrections[name]))
return self.corrections[name]
else:
log().debug('Getting show %s' % (name))
@ -673,11 +669,13 @@ def main():
grabs an episode name interactively.
"""
import logging
logging.basicConfig(level=logging.DEBUG)
tvrage_instance = TVRage(cache=False)
print tvrage_instance['Lost']['seriesname']
print tvrage_instance['Lost'][1][4]['episodename']
if __name__ == '__main__':
main()

359
lib/xmltodict.py Normal file
View File

@ -0,0 +1,359 @@
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"
from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
try: # pragma no cover
from cStringIO import StringIO
except ImportError: # pragma no cover
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
try: # pragma no cover
from collections import OrderedDict
except ImportError: # pragma no cover
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
try: # pragma no cover
_basestring = basestring
except NameError: # pragma no cover
_basestring = str
try: # pragma no cover
_unicode = unicode
except NameError: # pragma no cover
_unicode = str
__author__ = 'Martin Blech'
__version__ = '0.9.0'
__license__ = 'MIT'
class ParsingInterrupted(Exception):
    """Raised to stop parsing when a streaming item callback returns a false value."""
class _DictSAXHandler(object):
    """Builds nested dictionaries from expat start/end/character-data callbacks.

    The handler keeps the element currently being built in ``self.item`` and
    its accumulated text in ``self.data``; enclosing elements wait on
    ``self.stack`` while ``self.path`` records the (name, attributes) pairs
    from the document root down to the current element.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None):
        # Mutable parsing state.
        self.path = []
        self.stack = []
        self.data = None
        self.item = None
        # Options controlling how the dictionaries are shaped.
        self.item_depth = item_depth
        self.item_callback = item_callback
        self.xml_attribs = xml_attribs
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces

    def _build_name(self, full_name):
        """Return ``full_name`` with its namespace part remapped via ``self.namespaces``."""
        if not self.namespaces:
            return full_name
        split_at = full_name.rfind(self.namespace_separator)
        if split_at == -1:
            return full_name
        namespace = full_name[:split_at]
        local_name = full_name[split_at+1:]
        mapped = self.namespaces.get(namespace, namespace)
        if mapped:
            return self.namespace_separator.join((mapped, local_name))
        # A false-ish mapping means the namespace is dropped entirely.
        return local_name

    def _attrs_to_dict(self, attrs):
        """Turn a flat ``[name, value, name, value, ...]`` sequence into a mapping."""
        if isinstance(attrs, dict):
            return attrs
        names = attrs[0::2]
        values = attrs[1::2]
        return self.dict_constructor(zip(names, values))

    def startElement(self, full_name, attrs):
        """Record the opened element and start collecting its content."""
        name = self._build_name(full_name)
        attr_map = self._attrs_to_dict(attrs)
        self.path.append((name, attr_map or None))
        if len(self.path) <= self.item_depth:
            # Shallower than the streaming depth: nothing is accumulated.
            return
        self.stack.append((self.item, self.data))
        if self.xml_attribs:
            opened = self.dict_constructor(
                (self.attr_prefix+attr_name, attr_value)
                for (attr_name, attr_value) in attr_map.items())
        else:
            opened = None
        self.item = opened or None
        self.data = None

    def endElement(self, full_name):
        """Finish the current element and attach it to its parent."""
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            payload = self.data if self.item is None else self.item
            if not self.item_callback(self.path, payload):
                raise ParsingInterrupted()
        if self.stack:
            child, text = self.item, self.data
            self.item, self.data = self.stack.pop()
            if text is not None and self.strip_whitespace:
                text = text.strip() or None
            if child is None and text and self.force_cdata:
                child = self.dict_constructor()
            if child is None:
                # Text-only (or empty) element: store the text itself.
                self.item = self.push_data(self.item, name, text)
            else:
                if text:
                    self.push_data(child, self.cdata_key, text)
                self.item = self.push_data(self.item, name, child)
        else:
            self.item = self.data = None
        self.path.pop()

    def characters(self, data):
        """Accumulate character data for the current element."""
        if self.data:
            self.data += self.cdata_separator + data
        else:
            self.data = data

    def push_data(self, item, key, data):
        """Store ``data`` under ``key`` in ``item``; repeated keys become lists.

        The postprocessor, when set, may rewrite the pair or return ``None``
        to drop it. Returns the (possibly newly created) container.
        """
        if self.postprocessor is not None:
            processed = self.postprocessor(self.path, key, data)
            if processed is None:
                return item
            key, data = processed
        if item is None:
            item = self.dict_constructor()
        try:
            existing = item[key]
        except KeyError:
            item[key] = data
        else:
            if isinstance(existing, list):
                existing.append(data)
            else:
                item[key] = [existing, data]
        return item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object. Anything
    exposing a `read` attribute is treated as a file and handed to
    `ParseFile`; everything else is parsed as a complete string.

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print 'path:%s item:%s' % (path, item)
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    """
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        # expat is fed bytes; encode text using the declared (or default) encoding.
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        # Passing None keeps expat from splitting qualified names.
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    # Dispatch explicitly instead of "try ParseFile, fall back to Parse":
    # the fallback also caught TypeError/AttributeError raised by handlers or
    # the postprocessor mid-parse and replaced them with a misleading error
    # from calling Parse() on a file object.
    if hasattr(xml_input, 'read'):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item
def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t'):
    """Write element ``key`` with content ``value`` to a SAX content handler.

    ``value`` may be a dict (attributes are keys starting with
    ``attr_prefix``, text is stored under ``cdata_key``, every other key is a
    child element), a list/tuple (one element is emitted per entry), ``None``
    (empty element) or any other scalar, which is emitted as text.

    ``preprocessor``, when given, is called with ``(key, value)`` and may
    return a replacement pair, or ``None`` to skip the element entirely.
    With ``pretty`` set, ``newl`` and ``indent`` whitespace is written around
    nested elements.

    Raises ValueError when more than one value is supplied at ``depth`` 0,
    since a document cannot have multiple roots.
    """
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    if not isinstance(value, (list, tuple)):
        value = [value]
    if depth == 0 and len(value) > 1:
        raise ValueError('document with multiple roots')
    for v in value:
        if v is None:
            v = OrderedDict()
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                # The XML writer only accepts text attribute values; coerce
                # numbers and other scalars the same way element text is.
                if not isinstance(iv, _basestring):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent)
        if cdata is not None:
            # Explicit text given as a non-string (e.g. {'#text': 3}) would
            # otherwise fail inside the content handler's escaping.
            if not isinstance(cdata, _basestring):
                cdata = _unicode(cdata)
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            **kwargs):
    """Emit an XML document for the given `input_dict` (reverse of `parse`).

    `input_dict` must hold exactly one top-level key, which becomes the root
    element. The XML is returned as a string unless `output` (a file-like
    object) is supplied, in which case it is written there and nothing is
    returned.

    Keys starting with `attr_prefix` (default=`'@'`) become XML attributes,
    and the `cdata_key` key (default=`'#text'`) becomes character data.

    `pretty` (default=`False`) turns on pretty-printing; lines then end with
    `'\n'` and are indented with `'\t'` unless overridden through the `newl`
    and `indent` parameters.

    """
    ((key, value),) = input_dict.items()
    return_text = output is None
    if return_text:
        output = StringIO()
    writer = XMLGenerator(output, encoding)
    if full_document:
        writer.startDocument()
    _emit(key, value, writer, **kwargs)
    if full_document:
        writer.endDocument()
    if not return_text:
        return None
    text = output.getvalue()
    try:  # pragma no cover
        # Byte strings (Python 2 buffers) are decoded; text has no decode().
        text = text.decode(encoding)
    except AttributeError:  # pragma no cover
        pass
    return text
if __name__ == '__main__':  # pragma: no cover
    import marshal
    import sys

    # Exactly one command-line argument is expected: the streaming depth.
    (item_depth,) = sys.argv[1:]
    item_depth = int(item_depth)

    def handle_item(path, item):
        # Serialize each (path, item) pair to stdout and keep parsing.
        marshal.dump((path, item), sys.stdout)
        return True

    try:
        root = parse(
            sys.stdin,
            item_depth=item_depth,
            item_callback=handle_item,
            dict_constructor=dict,
        )
        # At depth 0 nothing is streamed, so dump the whole document once.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass

View File

@ -782,14 +782,10 @@ class GenericMetadata():
# Try and get posters and fanart from TMDB
if image_url is None:
for show_name in set(allPossibleShowNames(show_obj)):
if image_type in ('poster', 'poster_thumb'):
image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True)
elif image_type == 'fanart':
image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True)
if image_url:
break
if image_type in ('poster', 'poster_thumb'):
image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True)
elif image_type == 'fanart':
image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True)
if image_url:
image_data = metadata_helpers.getShowImage(image_url, which)
@ -965,8 +961,6 @@ class GenericMetadata():
return (indexer_id, name, indexer)
def _retrieve_show_images_from_tmdb(self, show, backdrop=False, poster=False):
tmdb_id = None
# get TMDB configuration info
tmdb = TMDB(sickbeard.TMDB_API_KEY)
config = tmdb.Configuration()
@ -981,27 +975,14 @@ class GenericMetadata():
try:
search = tmdb.Search()
for result in search.collection({'query': show.name}) + search.tv({'query': show.name}):
tmdb_id = result['id']
external_ids = tmdb.TV(tmdb_id).external_ids()
if show.indexerid in [external_ids['tvdb_id'], external_ids['tvrage_id']]:
break
for show_name in set(allPossibleShowNames(show)):
for result in search.collection({'query': show_name})['results'] + search.tv({'query': show_name})['results']:
if backdrop and result['backdrop_path']:
return "{0}{1}{2}".format(base_url, max_size, result['backdrop_path'])
elif poster and result['poster_path']:
return "{0}{1}{2}".format(base_url, max_size, result['poster_path'])
if tmdb_id:
images = tmdb.Collections(tmdb_id).images()
if len(images) > 0:
# get backdrop urls
if backdrop:
rel_path = images['backdrops'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
# get poster urls
if poster:
rel_path = images['posters'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
except:
except Exception, e:
pass
logger.log(u"Could not find any posters or background for " + show.name, logger.DEBUG)

View File

@ -829,7 +829,7 @@ class TVShow(object):
self.airs = myEp["airs_dayofweek"] + " " + myEp["airs_time"]
if getattr(myEp, 'firstaired', None) is not None:
self.startyear = int(myEp["firstaired"].split('-')[0])
self.startyear = int(str(myEp["firstaired"]).split('-')[0])
self.status = getattr(myEp, 'status', '')
@ -855,7 +855,6 @@ class TVShow(object):
i = imdb.IMDb()
imdbTv = i.get_movie(str(re.sub("[^0-9]", "", self.imdbid)))
test = imdbTv.keys()
for key in filter(lambda x: x.replace('_', ' ') in imdbTv.keys(), imdb_info.keys()):
# Store only the first value for string type
if type(imdb_info[key]) == type('') and type(imdbTv.get(key)) == type([]):
@ -1556,7 +1555,7 @@ class TVEpisode(object):
self.deleteEpisode()
return False
if myEp["absolute_number"] == None or myEp["absolute_number"] == "":
if getattr(myEp, 'absolute_number', None) is None:
logger.log(u"This episode (" + self.show.name + " - " + str(season) + "x" + str(
episode) + ") has no absolute number on " + sickbeard.indexerApi(
self.indexer).name
@ -1564,7 +1563,7 @@ class TVEpisode(object):
else:
logger.log(
str(self.show.indexerid) + ": The absolute_number for " + str(season) + "x" + str(episode) + " is : " +
myEp["absolute_number"], logger.DEBUG)
str(myEp["absolute_number"]), logger.DEBUG)
self.absolute_number = int(myEp["absolute_number"])
self.name = getattr(myEp, 'episodename', "")
@ -1603,8 +1602,9 @@ class TVEpisode(object):
u"The show dir is missing, not bothering to change the episode statuses since it'd probably be invalid")
return
logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str(
episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG)
if self.location:
logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str(
episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG)
if not ek.ek(os.path.isfile, self.location):