1
0
mirror of https://github.com/moparisthebest/SickRage synced 2024-10-31 15:35:01 -04:00

Update imdbpy libs to v5.0

Fixed invalid indexer id issues for TVRage shows.

Fixed issues for getting posters and backdrops for TVRage shows.

We now convert XML straight to a dict object for the Indexer APIs, improving the overall performance of the APIs.

Fixed issues with displaying genres properly for TVRage shows.
This commit is contained in:
echel0n 2014-05-28 22:40:12 -07:00
parent 764cf6e62e
commit 2dcd26e69c
30 changed files with 7446 additions and 453 deletions

View File

@ -6,7 +6,7 @@ a person from the IMDb database.
It can fetch data through different media (e.g.: the IMDb web pages, It can fetch data through different media (e.g.: the IMDb web pages,
a SQL database, etc.) a SQL database, etc.)
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2014 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -25,7 +25,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
__all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company', __all__ = ['IMDb', 'IMDbError', 'Movie', 'Person', 'Character', 'Company',
'available_access_systems'] 'available_access_systems']
__version__ = VERSION = '4.9' __version__ = VERSION = '5.0'
# Import compatibility module (importing it is enough). # Import compatibility module (importing it is enough).
import _compat import _compat
@ -160,6 +160,7 @@ def IMDb(accessSystem=None, *arguments, **keywords):
kwds.update(keywords) kwds.update(keywords)
keywords = kwds keywords = kwds
except Exception, e: except Exception, e:
import logging
logging.getLogger('imdbpy').warn('Unable to read configuration' \ logging.getLogger('imdbpy').warn('Unable to read configuration' \
' file; complete error: %s' % e) ' file; complete error: %s' % e)
# It just LOOKS LIKE a bad habit: we tried to read config # It just LOOKS LIKE a bad habit: we tried to read config
@ -303,7 +304,7 @@ class IMDbBase:
# http://akas.imdb.com/keyword/%s/ # http://akas.imdb.com/keyword/%s/
imdbURL_keyword_main=imdbURL_base + 'keyword/%s/' imdbURL_keyword_main=imdbURL_base + 'keyword/%s/'
# http://akas.imdb.com/chart/top # http://akas.imdb.com/chart/top
imdbURL_top250=imdbURL_base + 'chart/top', imdbURL_top250=imdbURL_base + 'chart/top'
# http://akas.imdb.com/chart/bottom # http://akas.imdb.com/chart/bottom
imdbURL_bottom100=imdbURL_base + 'chart/bottom' imdbURL_bottom100=imdbURL_base + 'chart/bottom'
# http://akas.imdb.com/find?%s # http://akas.imdb.com/find?%s
@ -824,22 +825,23 @@ class IMDbBase:
# subclass, somewhere under the imdb.parser package. # subclass, somewhere under the imdb.parser package.
raise NotImplementedError('override this method') raise NotImplementedError('override this method')
def _searchIMDb(self, kind, ton): def _searchIMDb(self, kind, ton, title_kind=None):
"""Search the IMDb akas server for the given title or name.""" """Search the IMDb akas server for the given title or name."""
# The Exact Primary search system has gone AWOL, so we resort # The Exact Primary search system has gone AWOL, so we resort
# to the mobile search. :-/ # to the mobile search. :-/
if not ton: if not ton:
return None return None
ton = ton.strip('"')
aSystem = IMDb('mobile') aSystem = IMDb('mobile')
if kind == 'tt': if kind == 'tt':
searchFunct = aSystem.search_movie searchFunct = aSystem.search_movie
check = 'long imdb canonical title' check = 'long imdb title'
elif kind == 'nm': elif kind == 'nm':
searchFunct = aSystem.search_person searchFunct = aSystem.search_person
check = 'long imdb canonical name' check = 'long imdb name'
elif kind == 'char': elif kind == 'char':
searchFunct = aSystem.search_character searchFunct = aSystem.search_character
check = 'long imdb canonical name' check = 'long imdb name'
elif kind == 'co': elif kind == 'co':
# XXX: are [COUNTRY] codes included in the results? # XXX: are [COUNTRY] codes included in the results?
searchFunct = aSystem.search_company searchFunct = aSystem.search_company
@ -852,24 +854,42 @@ class IMDbBase:
# exact match. # exact match.
if len(searchRes) == 1: if len(searchRes) == 1:
return searchRes[0].getID() return searchRes[0].getID()
title_only_matches = []
for item in searchRes: for item in searchRes:
# Return the first perfect match. # Return the first perfect match.
if item[check] == ton: if item[check].strip('"') == ton:
# For titles do additional check for kind
if kind != 'tt' or title_kind == item['kind']:
return item.getID() return item.getID()
elif kind == 'tt':
title_only_matches.append(item.getID())
# imdbpy2sql.py could detected wrong type, so if no title and kind
# matches found - collect all results with title only match
# Return list of IDs if multiple matches (can happen when searching
# titles with no title_kind specified)
# Example: DB: Band of Brothers "tv series" vs "tv mini-series"
if title_only_matches:
if len(title_only_matches) == 1:
return title_only_matches[0]
else:
return title_only_matches
return None return None
def title2imdbID(self, title): def title2imdbID(self, title, kind=None):
"""Translate a movie title (in the plain text data files format) """Translate a movie title (in the plain text data files format)
to an imdbID. to an imdbID.
Try an Exact Primary Title search on IMDb; Try an Exact Primary Title search on IMDb;
return None if it's unable to get the imdbID.""" return None if it's unable to get the imdbID;
return self._searchIMDb('tt', title) Always specify kind: movie, tv series, video game etc. or search can
return list of IDs if multiple matches found
"""
return self._searchIMDb('tt', title, kind)
def name2imdbID(self, name): def name2imdbID(self, name):
"""Translate a person name in an imdbID. """Translate a person name in an imdbID.
Try an Exact Primary Name search on IMDb; Try an Exact Primary Name search on IMDb;
return None if it's unable to get the imdbID.""" return None if it's unable to get the imdbID."""
return self._searchIMDb('tt', name) return self._searchIMDb('nm', name)
def character2imdbID(self, name): def character2imdbID(self, name):
"""Translate a character name in an imdbID. """Translate a character name in an imdbID.
@ -896,7 +916,8 @@ class IMDbBase:
imdbID = aSystem.get_imdbMovieID(mop.movieID) imdbID = aSystem.get_imdbMovieID(mop.movieID)
else: else:
imdbID = aSystem.title2imdbID(build_title(mop, canonical=0, imdbID = aSystem.title2imdbID(build_title(mop, canonical=0,
ptdf=1)) ptdf=0, appendKind=False),
mop['kind'])
elif isinstance(mop, Person.Person): elif isinstance(mop, Person.Person):
if mop.personID is not None: if mop.personID is not None:
imdbID = aSystem.get_imdbPersonID(mop.personID) imdbID = aSystem.get_imdbPersonID(mop.personID)

View File

@ -29,7 +29,7 @@
[imdbpy] [imdbpy]
## Default. ## Default.
accessSystem = mobile accessSystem = http
## Optional (options common to every data access system): ## Optional (options common to every data access system):
# Activate adult searches (on, by default). # Activate adult searches (on, by default).
@ -37,7 +37,7 @@ accessSystem = mobile
# Number of results for searches (20 by default). # Number of results for searches (20 by default).
#results = 20 #results = 20
# Re-raise all caught exceptions (off, by default). # Re-raise all caught exceptions (off, by default).
reraiseExceptions = on #reraiseExceptions = off
## Optional (options common to http and mobile data access systems): ## Optional (options common to http and mobile data access systems):
# Proxy used to access the network. If it requires authentication, # Proxy used to access the network. If it requires authentication,
@ -69,7 +69,7 @@ reraiseExceptions = on
## Set the threshold for logging messages. ## Set the threshold for logging messages.
# Can be one of "debug", "info", "warning", "error", "critical" (default: # Can be one of "debug", "info", "warning", "error", "critical" (default:
# "warning"). # "warning").
loggingLevel = info #loggingLevel = debug
## Path to a configuration file for the logging facility; ## Path to a configuration file for the logging facility;
# see: http://docs.python.org/library/logging.html#configuring-logging # see: http://docs.python.org/library/logging.html#configuring-logging

View File

@ -64,8 +64,10 @@ LANG_ARTICLES = {
'English': ('the', 'a', 'an'), 'English': ('the', 'a', 'an'),
'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'", 'Italian': ('la', 'le', "l'", 'il', 'i', 'un', 'una', 'gli', 'lo', "un'",
'uno'), 'uno'),
'Spanish': ('la', 'le', 'el', 'les', 'un', 'los', 'una', 'uno', 'unos', 'Spanish': ('la', 'lo', 'el', 'las', 'un', 'los', 'una', 'al', 'del',
'unas'), 'unos', 'unas', 'uno'),
'French': ('le', "l'", 'la', 'les', 'un', 'une', 'des', 'au', 'du', '\xc3\xa0 la',
'de la', 'aux'),
'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'), 'Portuguese': ('a', 'as', 'o', 'os', 'um', 'uns', 'uma', 'umas'),
'Turkish': (), # Some languages doesn't have articles. 'Turkish': (), # Some languages doesn't have articles.
} }

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
""" """
generatepot.py script. generatepot.py script.

1303
lib/imdb/locale/imdbpy-ar.po Normal file

File diff suppressed because it is too large Load Diff

1303
lib/imdb/locale/imdbpy-bg.po Normal file

File diff suppressed because it is too large Load Diff

1303
lib/imdb/locale/imdbpy-de.po Normal file

File diff suppressed because it is too large Load Diff

1304
lib/imdb/locale/imdbpy-es.po Normal file

File diff suppressed because it is too large Load Diff

1304
lib/imdb/locale/imdbpy-fr.po Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
"""Generate binary message catalog from textual translation description. """Generate binary message catalog from textual translation description.

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2 #!/usr/bin/env python
""" """
rebuildmo.py script. rebuildmo.py script.

View File

@ -104,15 +104,24 @@ PY_VERSION = sys.version_info[:2]
# The cookies for the "adult" search. # The cookies for the "adult" search.
# Please don't mess with these account. # Please don't mess with these account.
# Old 'IMDbPY' account. # Old 'IMDbPY' account.
_old_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1' _IMDbPY_cookie_id = 'boM2bYxz9MCsOnH9gZ0S9QHs12NWrNdApxsls1Vb5/NGrNdjcHx3dUas10UASoAjVEvhAbGagERgOpNkAPvxdbfKwaV2ikEj9SzXY1WPxABmDKQwdqzwRbM+12NSeJFGUEx3F8as10WwidLzVshDtxaPIbP13NdjVS9UZTYqgTVGrNcT9vyXU1'
_old_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q==' _IMDbPY_cookie_uu = '3M3AXsquTU5Gur/Svik+ewflPm5Rk2ieY3BIPlLjyK3C0Dp9F8UoPgbTyKiGtZp4x1X+uAUGKD7BM2g+dVd8eqEzDErCoYvdcvGLvVLAen1y08hNQtALjVKAe+1hM8g9QbNonlG1/t4S82ieUsBbrSIQbq1yhV6tZ6ArvSbA7rgHc8n5AdReyAmDaJ5Wm/ee3VDoCnGj/LlBs2ieUZNorhHDKK5Q=='
# New 'IMDbPYweb' account. # 'imdbpy2010' account.
_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1' _imdbpy2010_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI='
_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk=' _imdbpy2010_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A=='
# old 'IMDbPYweb' account.
_old_IMDbPYweb_cookie_id = 'rH1jNAkjTlNXvHolvBVBsgaPICNZbNdjVjzFwzas9JRmusdjVoqBs/Hs12NR+1WFxEoR9bGKEDUg6sNlADqXwkas12N131Rwdb+UQNGKN8PWrNdjcdqBQVLq8mbGDHP3hqzxhbD692NQi9D0JjpBtRaPIbP1zNdjUOqENQYv1ADWrNcT9vyXU1'
_old_IMDbPYweb_cookie_uu = 'su4/m8cho4c6HP+W1qgq6wchOmhnF0w+lIWvHjRUPJ6nRA9sccEafjGADJ6hQGrMd4GKqLcz2X4z5+w+M4OIKnRn7FpENH7dxDQu3bQEHyx0ZEyeRFTPHfQEX03XF+yeN1dsPpcXaqjUZAw+lGRfXRQEfz3RIX9IgVEffdBAHw2wQXyf9xdMPrQELw0QNB8dsffsqcdQemjPB0w+moLcPh0JrKrHJ9hjBzdMPpcXTH7XRwwOk='
# old 'IMDbPYweb' account values (as of 2012-12-30)
_IMDbPYweb_cookie_id = 'BCYjtpb46Go0cMHAMewWZEauhwqPL7ASCPpPVNutu6BuayHZd0U6Dk3UAqVlEM8DHLDsSr02RGQn5ff3245-R4A130NAWJ_5yqXx7X-zJey8vQM8JKdv3rTUSEJznJQlojUW1Bije-Q0FXAixs4I0sePWhd_tA41i-9AF2q3lPmaksram6ilMhN9i3IPESW1PMbk'
_IMDbPYweb_cookie_uu = 'BCYttQjEMc-NyUdFUGxThidAnBo7wwalEzj4un9uzf2XoEjtqDhNfrH7bOSuwlRkMEQ11SNyTajl-b9Q-21m4HwYu0e3jXZrjYLXLYzFkrEroCDyUREqaTwPJPSjGtFmvlaVBZEZmsWpaxe18DT5KiygKyGPZKH78Xu4im6ba-Sd31WvbXHzP8KGXPpGjhhVuv7Dcv314HCWkE832Srf9ya-Uv0FdGAmYyLbIAXuxnvpYQd6oZ8-CYkSGLIqcKWdrf5S'
# 'IMDbPY2013' account
_IMDbPY2013_cookie_id = 'BCYmoyqSm2WglmOzG-SrFWSvVpxsTZOB0qEOOqmAwCBxCbaNgKOxd0DTKzUvt7t04Pya5gV2tUrpDmYxrc1Dr54DQj2UXI7QI35__M5-HI2KrbOI3PjDz6M-_U3HG8topMfN64R24tmBixoZhMYXVaEc556lf0Z4gQNJVYRANXvwytP5v1lpfeToRlu9aVJwN4kT'
_IMDbPY2013_cookie_uu = 'BCYquDS8Y2i8R1pJxS4nB77YrhjHHXeOea2Xl9KtZvE6RZKVfMvzTGU4Vl5-yxfPbgRSiFJasyf-hhPuVvXyaHlfeBjNlbFT8hz2HzFFkQ_SxKxq05J51gi7Fv4SaAws1M-i7zmQ1TRunfJqCVIYqPwIs2NO7s4_YDH2ZoISVGLgca8OY2K58HychOZB1oRWHVeAJNhLJMrCWJBuGRLCNnQK5X9tA0dPPntr2Ussy0ouul-N1GQz-8y5vda3JJ_C6xkwmHcA6JrOdOFO_HqMWjVSXuxGEdrXC919JM9H0vooVvKeVgAEJnTh2GiVlUJUoH3c'
# imdbpy2010 account. # Currently used account.
#_cookie_id = 'QrCdxVi+L+WgqOLrQJJgBgRRXGInphxiBPU/YXSFDyExMFzCp6YcYgSVXyEUhS/xMID8wqemHGID4DlntwZ49vemP5UXsAxiJ4D6goSmHGIgNT9hMXBaRSF2vMS3phxB0bVfQiQlP1RxdrzhB6YcRHFASyIhQVowwXCKtDSlD2YhgRvxBsCKtGemHBKH9mxSI=' _cookie_id = _IMDbPY2013_cookie_id
#_cookie_uu = 'oiEo2yoJFCA2Zbn/o7Z1LAPIwotAu6QdALv3foDb1x5F/tdrFY63XkSfty4kntS8Y8jkHSDLt3406+d+JThEilPI0mtTaOQdA/t2/iErp22jaLdeVU5ya4PIREpj7HFdpzhEHadcIAngSER50IoHDpD6Bz4Qy3b+UIhE/hBbhz5Q63ceA2hEvhPo5B0FnrL9Q8jkWjDIbA0Au3d+AOtnXoCIRL4Q28c+UOtnXpP4RL4T6OQdA+6ijUCI5B0AW2d+UOtnXpPYRL4T6OQdA8jkTUOYlC0A==' _cookie_uu = _IMDbPY2013_cookie_uu
class _FakeURLOpener(object): class _FakeURLOpener(object):
@ -141,9 +150,10 @@ class IMDbURLopener(FancyURLopener):
for header in ('User-Agent', 'User-agent', 'user-agent'): for header in ('User-Agent', 'User-agent', 'user-agent'):
self.del_header(header) self.del_header(header)
self.set_header('User-Agent', 'Mozilla/5.0') self.set_header('User-Agent', 'Mozilla/5.0')
self.set_header('Accept-Language', 'en-us,en;q=0.5')
# XXX: This class is used also to perform "Exact Primary # XXX: This class is used also to perform "Exact Primary
# [Title|Name]" searches, and so by default the cookie is set. # [Title|Name]" searches, and so by default the cookie is set.
c_header = 'id=%s; uu=%s' % (_cookie_id, _cookie_uu) c_header = 'uu=%s; id=%s' % (_cookie_uu, _cookie_id)
self.set_header('Cookie', c_header) self.set_header('Cookie', c_header)
def get_proxy(self): def get_proxy(self):
@ -199,12 +209,11 @@ class IMDbURLopener(FancyURLopener):
server_encode = uopener.info().getparam('charset') server_encode = uopener.info().getparam('charset')
# Otherwise, look at the content-type HTML meta tag. # Otherwise, look at the content-type HTML meta tag.
if server_encode is None and content: if server_encode is None and content:
first_bytes = content[:512] begin_h = content.find('text/html; charset=')
begin_h = first_bytes.find('text/html; charset=')
if begin_h != -1: if begin_h != -1:
end_h = first_bytes[19+begin_h:].find('"') end_h = content[19+begin_h:].find('"')
if end_h != -1: if end_h != -1:
server_encode = first_bytes[19+begin_h:19+begin_h+end_h] server_encode = content[19+begin_h:19+begin_h+end_h]
if server_encode: if server_encode:
try: try:
if lookup(server_encode): if lookup(server_encode):
@ -455,16 +464,16 @@ class IMDbHTTPAccessSystem(IMDbBase):
results is the maximum number of results to be retrieved.""" results is the maximum number of results to be retrieved."""
if isinstance(ton, unicode): if isinstance(ton, unicode):
try: try:
ton = ton.encode('iso8859-1') ton = ton.encode('utf-8')
except Exception, e: except Exception, e:
try: try:
ton = ton.encode('utf-8') ton = ton.encode('iso8859-1')
except Exception, e: except Exception, e:
pass pass
##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results)) ##params = 'q=%s&%s=on&mx=%s' % (quote_plus(ton), kind, str(results))
params = 'q=%s;s=%s;mx=%s' % (quote_plus(ton), kind, str(results)) params = 'q=%s&s=%s&mx=%s' % (quote_plus(ton), kind, str(results))
if kind == 'ep': if kind == 'ep':
params = params.replace('s=ep;', 's=tt;ttype=ep;', 1) params = params.replace('s=ep&', 's=tt&ttype=ep&', 1)
cont = self._retrieve(self.urls['find'] % params) cont = self._retrieve(self.urls['find'] % params)
#print 'URL:', imdbURL_find % params #print 'URL:', imdbURL_find % params
if cont.find('Your search returned more than') == -1 or \ if cont.find('Your search returned more than') == -1 or \
@ -472,7 +481,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return cont return cont
# The retrieved page contains no results, because too many # The retrieved page contains no results, because too many
# titles or names contain the string we're looking for. # titles or names contain the string we're looking for.
params = 'q=%s;ls=%s;lm=0' % (quote_plus(ton), kind) params = 'q=%s&ls=%s&lm=0' % (quote_plus(ton), kind)
size = 131072 + results * 512 size = 131072 + results * 512
return self._retrieve(self.urls['find'] % params, size=size) return self._retrieve(self.urls['find'] % params, size=size)
@ -587,6 +596,10 @@ class IMDbHTTPAccessSystem(IMDbBase):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations') cont = self._retrieve(self.urls['movie_main'] % movieID + 'recommendations')
return self.mProxy.rec_parser.parse(cont) return self.mProxy.rec_parser.parse(cont)
def get_movie_critic_reviews(self, movieID):
    """Retrieve the "criticreviews" page of a movie and return its parsed data.

    The URL is built from the 'movie_main' URL template filled in with
    movieID, followed by 'criticreviews'; the downloaded content is
    handed to the critic reviews parser of the movie proxy.
    """
    page_url = self.urls['movie_main'] % movieID + 'criticreviews'
    # Download first, then look up the parser (same order as before).
    page_content = self._retrieve(page_url)
    return self.mProxy.criticrev_parser.parse(page_content)
def get_movie_external_reviews(self, movieID): def get_movie_external_reviews(self, movieID):
cont = self._retrieve(self.urls['movie_main'] % movieID + 'externalreviews') cont = self._retrieve(self.urls['movie_main'] % movieID + 'externalreviews')
return self.mProxy.externalrev_parser.parse(cont) return self.mProxy.externalrev_parser.parse(cont)
@ -754,7 +767,7 @@ class IMDbHTTPAccessSystem(IMDbBase):
return self.pProxy.person_keywords_parser.parse(cont) return self.pProxy.person_keywords_parser.parse(cont)
def _search_character(self, name, results): def _search_character(self, name, results):
cont = self._get_search_content('char', name, results) cont = self._get_search_content('ch', name, results)
return self.scProxy.search_character_parser.parse(cont, results=results)['data'] return self.scProxy.search_character_parser.parse(cont, results=results)['data']
def get_character_main(self, characterID): def get_character_main(self, characterID):

View File

@ -9,7 +9,7 @@ pages would be:
plot summary: http://akas.imdb.com/title/tt0094226/plotsummary plot summary: http://akas.imdb.com/title/tt0094226/plotsummary
...and so on... ...and so on...
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -531,9 +531,6 @@ class DOMHTMLMovieParser(DOMParserBase):
def _process_plotsummary(x): def _process_plotsummary(x):
"""Process a plot (contributed by Rdian06).""" """Process a plot (contributed by Rdian06)."""
xauthor = x.get('author') xauthor = x.get('author')
if xauthor:
xauthor = xauthor.replace('{', '<').replace('}', '>').replace('(',
'<').replace(')', '>').strip()
xplot = x.get('plot', u'').strip() xplot = x.get('plot', u'').strip()
if xauthor: if xauthor:
xplot += u'::%s' % xauthor xplot += u'::%s' % xauthor
@ -555,17 +552,20 @@ class DOMHTMLPlotParser(DOMParserBase):
# Notice that recently IMDb started to put the email of the # Notice that recently IMDb started to put the email of the
# author only in the link, that we're not collecting, here. # author only in the link, that we're not collecting, here.
extractors = [Extractor(label='plot', extractors = [Extractor(label='plot',
path="//p[@class='plotpar']", path="//ul[@class='zebraList']//p",
attrs=Attribute(key='plot', attrs=Attribute(key='plot',
multi=True, multi=True,
path={'plot': './text()', path={'plot': './text()[1]',
'author': './i/a/text()'}, 'author': './span/em/a/text()'},
postprocess=_process_plotsummary))] postprocess=_process_plotsummary))]
def _process_award(x): def _process_award(x):
award = {} award = {}
award['award'] = x.get('award').strip() _award = x.get('award')
if _award is not None:
_award = _award.strip()
award['award'] = _award
if not award['award']: if not award['award']:
return {} return {}
award['year'] = x.get('year').strip() award['year'] = x.get('year').strip()
@ -709,10 +709,16 @@ class DOMHTMLTaglinesParser(DOMParserBase):
result = tparser.parse(taglines_html_string) result = tparser.parse(taglines_html_string)
""" """
extractors = [Extractor(label='taglines', extractors = [Extractor(label='taglines',
path="//div[@id='tn15content']/p", path='//*[contains(concat(" ", normalize-space(@class), " "), " soda ")]',
attrs=Attribute(key='taglines', multi=True, attrs=Attribute(key='taglines',
multi=True,
path="./text()"))] path="./text()"))]
def postprocess_data(self, data):
    """Strip leading and trailing whitespace from every collected tagline.

    data is the dictionary of values gathered by the extractors; when it
    has a 'taglines' key, that list is replaced by a cleaned-up copy.
    The same dictionary is returned.
    """
    if 'taglines' in data:
        # A None entry becomes an empty string instead of raising
        # AttributeError on .strip().
        data['taglines'] = [(tagline or u'').strip()
                            for tagline in data['taglines']]
    return data
class DOMHTMLKeywordsParser(DOMParserBase): class DOMHTMLKeywordsParser(DOMParserBase):
"""Parser for the "keywords" page of a given movie. """Parser for the "keywords" page of a given movie.
@ -785,9 +791,9 @@ class DOMHTMLSoundtrackParser(DOMHTMLAlternateVersionsParser):
] ]
def postprocess_data(self, data): def postprocess_data(self, data):
if 'soundtrack' in data: if 'alternate versions' in data:
nd = [] nd = []
for x in data['soundtrack']: for x in data['alternate versions']:
ds = x.split('\n') ds = x.split('\n')
title = ds[0] title = ds[0]
if title[0] == '"' and title[-1] == '"': if title[0] == '"' and title[-1] == '"':
@ -846,6 +852,13 @@ class DOMHTMLCrazyCreditsParser(DOMParserBase):
x.replace('\n', ' ').replace(' ', ' ')))] x.replace('\n', ' ').replace(' ', ' ')))]
def _process_goof(x):
if x['spoiler_category']:
return x['spoiler_category'].strip() + ': SPOILER: ' + x['text'].strip()
else:
return x['category'].strip() + ': ' + x['text'].strip()
class DOMHTMLGoofsParser(DOMParserBase): class DOMHTMLGoofsParser(DOMParserBase):
"""Parser for the "goofs" page of a given movie. """Parser for the "goofs" page of a given movie.
The page should be provided as a string, as taken from The page should be provided as a string, as taken from
@ -858,9 +871,14 @@ class DOMHTMLGoofsParser(DOMParserBase):
""" """
_defGetRefs = True _defGetRefs = True
extractors = [Extractor(label='goofs', path="//ul[@class='trivia']/li", extractors = [Extractor(label='goofs', path="//div[@class='soda odd']",
attrs=Attribute(key='goofs', multi=True, path=".//text()", attrs=Attribute(key='goofs', multi=True,
postprocess=lambda x: (x or u'').strip()))] path={
'text':"./text()",
'category':'./preceding-sibling::h4[1]/text()',
'spoiler_category': './h4/text()'
},
postprocess=_process_goof))]
class DOMHTMLQuotesParser(DOMParserBase): class DOMHTMLQuotesParser(DOMParserBase):
@ -876,9 +894,16 @@ class DOMHTMLQuotesParser(DOMParserBase):
_defGetRefs = True _defGetRefs = True
extractors = [ extractors = [
Extractor(label='quotes', Extractor(label='quotes_odd',
path="//div[@class='_imdbpy']", path="//div[@class='quote soda odd']",
attrs=Attribute(key='quotes', attrs=Attribute(key='quotes_odd',
multi=True,
path=".//text()",
postprocess=lambda x: x.strip().replace(' \n',
'::').replace('::\n', '::').replace('\n', ' '))),
Extractor(label='quotes_even',
path="//div[@class='quote soda even']",
attrs=Attribute(key='quotes_even',
multi=True, multi=True,
path=".//text()", path=".//text()",
postprocess=lambda x: x.strip().replace(' \n', postprocess=lambda x: x.strip().replace(' \n',
@ -886,27 +911,23 @@ class DOMHTMLQuotesParser(DOMParserBase):
] ]
preprocessors = [ preprocessors = [
(re.compile('(<a name="?qt[0-9]{7}"?></a>)', re.I), (re.compile('<a href="#" class="hidesoda hidden">Hide options</a><br>', re.I), '')
r'\1<div class="_imdbpy">'),
(re.compile('<hr width="30%">', re.I), '</div>'),
(re.compile('<hr/>', re.I), '</div>'),
(re.compile('<script.*?</script>', re.I|re.S), ''),
# For BeautifulSoup.
(re.compile('<!-- sid: t-channel : MIDDLE_CENTER -->', re.I), '</div>')
] ]
def preprocess_dom(self, dom): def preprocess_dom(self, dom):
# Remove "link this quote" links. # Remove "link this quote" links.
for qLink in self.xpath(dom, "//p[@class='linksoda']"): for qLink in self.xpath(dom, "//span[@class='linksoda']"):
qLink.drop_tree()
for qLink in self.xpath(dom, "//div[@class='sharesoda_pre']"):
qLink.drop_tree() qLink.drop_tree()
return dom return dom
def postprocess_data(self, data): def postprocess_data(self, data):
if 'quotes' not in data: quotes = data.get('quotes_odd', []) + data.get('quotes_even', [])
if not quotes:
return {} return {}
for idx, quote in enumerate(data['quotes']): quotes = [q.split('::') for q in quotes]
data['quotes'][idx] = quote.split('::') return {'quotes': quotes}
return data
class DOMHTMLReleaseinfoParser(DOMParserBase): class DOMHTMLReleaseinfoParser(DOMParserBase):
@ -920,13 +941,13 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
result = rdparser.parse(releaseinfo_html_string) result = rdparser.parse(releaseinfo_html_string)
""" """
extractors = [Extractor(label='release dates', extractors = [Extractor(label='release dates',
path="//th[@class='xxxx']/../../tr", path="//table[@id='release_dates']//tr",
attrs=Attribute(key='release dates', multi=True, attrs=Attribute(key='release dates', multi=True,
path={'country': ".//td[1]//text()", path={'country': ".//td[1]//text()",
'date': ".//td[2]//text()", 'date': ".//td[2]//text()",
'notes': ".//td[3]//text()"})), 'notes': ".//td[3]//text()"})),
Extractor(label='akas', Extractor(label='akas',
path="//div[@class='_imdbpy_akas']/table/tr", path="//table[@id='akas']//tr",
attrs=Attribute(key='akas', multi=True, attrs=Attribute(key='akas', multi=True,
path={'title': "./td[1]/text()", path={'title': "./td[1]/text()",
'countries': "./td[2]/text()"}))] 'countries': "./td[2]/text()"}))]
@ -961,7 +982,7 @@ class DOMHTMLReleaseinfoParser(DOMParserBase):
title = (aka.get('title') or '').strip() title = (aka.get('title') or '').strip()
if not title: if not title:
continue continue
countries = (aka.get('countries') or '').split('/') countries = (aka.get('countries') or '').split(',')
if not countries: if not countries:
nakas.append(title) nakas.append(title)
else: else:
@ -1135,6 +1156,27 @@ def _normalize_href(href):
href = '%s%s' % (imdbURL_base, href) href = '%s%s' % (imdbURL_base, href)
return href return href
class DOMHTMLCriticReviewsParser(DOMParserBase):
    """Parser for the "critic reviews" page of a given movie.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        crparser = DOMHTMLCriticReviewsParser()
        result = crparser.parse(criticreviews_html_string)
    """
    kind = 'critic reviews'

    extractors = [
        # Text of the span nested in the 'metascore_wrap' div.
        Extractor(
            label='metascore',
            path="//div[@class='metascore_wrap']/div/span",
            attrs=Attribute(key='metascore', path=".//text()")
        ),
        # Target of the link in the 'see-more' div of the 'article' div.
        Extractor(
            label='metacritic url',
            path="//div[@class='article']/div[@class='see-more']/a",
            attrs=Attribute(key='metacritic url', path="./@href")
        )
    ]
class DOMHTMLOfficialsitesParser(DOMParserBase): class DOMHTMLOfficialsitesParser(DOMParserBase):
"""Parser for the "official sites", "external reviews", "newsgroup """Parser for the "official sites", "external reviews", "newsgroup
@ -1471,6 +1513,14 @@ class DOMHTMLSeasonEpisodesParser(DOMParserBase):
try: selected_season = int(selected_season) try: selected_season = int(selected_season)
except: pass except: pass
nd = {selected_season: {}} nd = {selected_season: {}}
if 'episode -1' in data:
counter = 1
for episode in data['episode -1']:
while 'episode %d' % counter in data:
counter += 1
k = 'episode %d' % counter
data[k] = [episode]
del data['episode -1']
for episode_nr, episode in data.iteritems(): for episode_nr, episode in data.iteritems():
if not (episode and episode[0] and if not (episode and episode[0] and
episode_nr.startswith('episode ')): episode_nr.startswith('episode ')):
@ -1860,6 +1910,8 @@ _OBJECTS = {
'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None), 'releasedates_parser': ((DOMHTMLReleaseinfoParser,), None),
'ratings_parser': ((DOMHTMLRatingsParser,), None), 'ratings_parser': ((DOMHTMLRatingsParser,), None),
'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None), 'officialsites_parser': ((DOMHTMLOfficialsitesParser,), None),
'criticrev_parser': ((DOMHTMLCriticReviewsParser,),
{'kind': 'critic reviews'}),
'externalrev_parser': ((DOMHTMLOfficialsitesParser,), 'externalrev_parser': ((DOMHTMLOfficialsitesParser,),
{'kind': 'external reviews'}), {'kind': 'external reviews'}),
'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,), 'newsgrouprev_parser': ((DOMHTMLOfficialsitesParser,),

View File

@ -8,7 +8,7 @@ E.g., for "Mel Gibson" the referred pages would be:
biography: http://akas.imdb.com/name/nm0000154/bio biography: http://akas.imdb.com/name/nm0000154/bio
...and so on... ...and so on...
Copyright 2004-20101 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -60,6 +60,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
result = cparser.parse(categorized_html_string) result = cparser.parse(categorized_html_string)
""" """
_containsObjects = True _containsObjects = True
_name_imdb_index = re.compile(r'\([IVXLCDM]+\)')
_birth_attrs = [Attribute(key='birth date', _birth_attrs = [Attribute(key='birth date',
path='.//time[@itemprop="birthDate"]/@datetime'), path='.//time[@itemprop="birthDate"]/@datetime'),
@ -100,6 +101,10 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
path=".//text()", path=".//text()",
postprocess=lambda x: analyze_name(x, postprocess=lambda x: analyze_name(x,
canonical=1))), canonical=1))),
Extractor(label='name_index',
path="//h1[@class='header']/span[1]",
attrs=Attribute(key='name_index',
path="./text()")),
Extractor(label='birth info', Extractor(label='birth info',
path="//div[h4='Born:']", path="//div[h4='Born:']",
@ -110,7 +115,7 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
attrs=_death_attrs), attrs=_death_attrs),
Extractor(label='headshot', Extractor(label='headshot',
path="//td[@id='img_primary']/a", path="//td[@id='img_primary']/div[@class='image']/a",
attrs=Attribute(key='headshot', attrs=Attribute(key='headshot',
path="./img/@src")), path="./img/@src")),
@ -152,6 +157,11 @@ class DOMHTMLMaindetailsParser(DOMParserBase):
for what in 'birth date', 'death date': for what in 'birth date', 'death date':
if what in data and not data[what]: if what in data and not data[what]:
del data[what] del data[what]
name_index = (data.get('name_index') or '').strip()
if name_index:
if self._name_imdb_index.match(name_index):
data['imdbIndex'] = name_index[1:-1]
del data['name_index']
# XXX: the code below is for backwards compatibility # XXX: the code below is for backwards compatibility
# probably could be removed # probably could be removed
for key in data.keys(): for key in data.keys():
@ -220,13 +230,13 @@ class DOMHTMLBioParser(DOMParserBase):
attrs=Attribute(key='headshot', attrs=Attribute(key='headshot',
path="./img/@src")), path="./img/@src")),
Extractor(label='birth info', Extractor(label='birth info',
path="//div[h5='Date of Birth']", path="//table[@id='overviewTable']//td[text()='Date of Birth']/following-sibling::td[1]",
attrs=_birth_attrs), attrs=_birth_attrs),
Extractor(label='death info', Extractor(label='death info',
path="//div[h5='Date of Death']", path="//table[@id='overviewTable']//td[text()='Date of Death']/following-sibling::td[1]",
attrs=_death_attrs), attrs=_death_attrs),
Extractor(label='nick names', Extractor(label='nick names',
path="//div[h5='Nickname']", path="//table[@id='overviewTable']//td[text()='Nickenames']/following-sibling::td[1]",
attrs=Attribute(key='nick names', attrs=Attribute(key='nick names',
path="./text()", path="./text()",
joiner='|', joiner='|',
@ -234,25 +244,25 @@ class DOMHTMLBioParser(DOMParserBase):
'::(', 1) for n in x.split('|') '::(', 1) for n in x.split('|')
if n.strip()])), if n.strip()])),
Extractor(label='birth name', Extractor(label='birth name',
path="//div[h5='Birth Name']", path="//table[@id='overviewTable']//td[text()='Birth Name']/following-sibling::td[1]",
attrs=Attribute(key='birth name', attrs=Attribute(key='birth name',
path="./text()", path="./text()",
postprocess=lambda x: canonicalName(x.strip()))), postprocess=lambda x: canonicalName(x.strip()))),
Extractor(label='height', Extractor(label='height',
path="//div[h5='Height']", path="//table[@id='overviewTable']//td[text()='Height']/following-sibling::td[1]",
attrs=Attribute(key='height', attrs=Attribute(key='height',
path="./text()", path="./text()",
postprocess=lambda x: x.strip())), postprocess=lambda x: x.strip())),
Extractor(label='mini biography', Extractor(label='mini biography',
path="//div[h5='Mini Biography']", path="//a[@name='mini_bio']/following-sibling::div[1 = count(preceding-sibling::a[1] | ../a[@name='mini_bio'])]",
attrs=Attribute(key='mini biography', attrs=Attribute(key='mini biography',
multi=True, multi=True,
path={ path={
'bio': "./p//text()", 'bio': ".//text()",
'by': "./b/following-sibling::a/text()" 'by': ".//a[@name='ba']//text()"
}, },
postprocess=lambda x: "%s::%s" % \ postprocess=lambda x: "%s::%s" % \
(x.get('bio').strip(), ((x.get('bio') or u'').split('- IMDb Mini Biography By:')[0].strip(),
(x.get('by') or u'').strip() or u'Anonymous'))), (x.get('by') or u'').strip() or u'Anonymous'))),
Extractor(label='spouse', Extractor(label='spouse',
path="//div[h5='Spouse']/table/tr", path="//div[h5='Spouse']/table/tr",

View File

@ -5,9 +5,9 @@ This module provides the HTMLSearchCharacterParser class (and the
search_character_parser instance), used to parse the results of a search search_character_parser instance), used to parse the results of a search
for a given character. for a given character.
E.g., when searching for the name "Jesse James", the parsed page would be: E.g., when searching for the name "Jesse James", the parsed page would be:
http://akas.imdb.com/find?s=Characters;mx=20;q=Jesse+James http://akas.imdb.com/find?s=ch;mx=20;q=Jesse+James
Copyright 2007-2009 Davide Alberani <da@erlug.linux.it> Copyright 2007-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCharacterParser(DOMBasicMovieParser):
class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser): class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCharacterParser _BaseParser = DOMBasicCharacterParser
_notDirectHitTitle = '<title>imdb search' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=False) _titleBuilder = lambda self, x: build_name(x, canonical=False)
_linkPrefix = '/character/ch' _linkPrefix = '/character/ch'
@ -57,7 +57,7 @@ class DOMHTMLSearchCharacterParser(DOMHTMLSearchMovieParser):
{'name': x.get('name')} {'name': x.get('name')}
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \ path="//td[@class='result_text']/a[starts-with(@href, " \
"'/character/ch')]/..", "'/character/ch')]/..",
attrs=_attrs)] attrs=_attrs)]

View File

@ -7,7 +7,7 @@ for a given company.
E.g., when searching for the name "Columbia Pictures", the parsed page would be: E.g., when searching for the name "Columbia Pictures", the parsed page would be:
http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures http://akas.imdb.com/find?s=co;mx=20;q=Columbia+Pictures
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it> Copyright 2008-2012 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -42,7 +42,7 @@ class DOMBasicCompanyParser(DOMBasicMovieParser):
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser): class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
_BaseParser = DOMBasicCompanyParser _BaseParser = DOMBasicCompanyParser
_notDirectHitTitle = '<title>imdb company' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_company_name(x) _titleBuilder = lambda self, x: build_company_name(x)
_linkPrefix = '/company/co' _linkPrefix = '/company/co'
@ -59,7 +59,7 @@ class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
or u''), stripNotes=True) or u''), stripNotes=True)
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, " \ path="//td[@class='result_text']/a[starts-with(@href, " \
"'/company/co')]/..", "'/company/co')]/..",
attrs=_attrs)] attrs=_attrs)]

View File

@ -8,7 +8,7 @@ E.g., for when searching for the title "the passion", the parsed
page would be: page would be:
http://akas.imdb.com/find?q=the+passion&tt=on&mx=20 http://akas.imdb.com/find?q=the+passion&tt=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -77,7 +77,7 @@ class DOMBasicMovieParser(DOMParserBase):
def custom_analyze_title(title): def custom_analyze_title(title):
"""Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)""" """Remove garbage notes after the (year), (year/imdbIndex) or (year) (TV)"""
# XXX: very crappy. :-( # XXX: very crappy. :-(
nt = title.split(' ')[0] nt = title.split(' aka ')[0]
if nt: if nt:
title = nt title = nt
if not title: if not title:
@ -92,7 +92,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
"new search system" is used, for movies.""" "new search system" is used, for movies."""
_BaseParser = DOMBasicMovieParser _BaseParser = DOMBasicMovieParser
_notDirectHitTitle = '<title>imdb title' _notDirectHitTitle = '<title>find - imdb</title>'
_titleBuilder = lambda self, x: build_title(x) _titleBuilder = lambda self, x: build_title(x)
_linkPrefix = '/title/tt' _linkPrefix = '/title/tt'
@ -101,8 +101,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
path={ path={
'link': "./a[1]/@href", 'link': "./a[1]/@href",
'info': ".//text()", 'info': ".//text()",
#'akas': ".//div[@class='_imdbpyAKA']//text()" 'akas': "./i//text()"
'akas': ".//p[@class='find-aka']//text()"
}, },
postprocess=lambda x: ( postprocess=lambda x: (
analyze_imdbid(x.get('link') or u''), analyze_imdbid(x.get('link') or u''),
@ -110,7 +109,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
x.get('akas') x.get('akas')
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/title/tt')]/..", path="//td[@class='result_text']",
attrs=_attrs)] attrs=_attrs)]
def _init(self): def _init(self):
self.url = u'' self.url = u''
@ -119,14 +118,11 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
self.url = u'' self.url = u''
def preprocess_string(self, html_string): def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower(): if self._notDirectHitTitle in html_string[:10240].lower():
if self._linkPrefix == '/title/tt': if self._linkPrefix == '/title/tt':
# Only for movies. # Only for movies.
# XXX (HTU): does this still apply?
html_string = html_string.replace('(TV mini-series)', '(mini)') html_string = html_string.replace('(TV mini-series)', '(mini)')
html_string = html_string.replace('<p class="find-aka">',
'<p class="find-aka">::')
#html_string = _reAKAStitles.sub(
# r'<div class="_imdbpyAKA">\1::</div>\2', html_string)
return html_string return html_string
# Direct hit! # Direct hit!
dbme = self._BaseParser(useModule=self._useModule) dbme = self._BaseParser(useModule=self._useModule)
@ -141,7 +137,7 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
title = self._titleBuilder(res[0][1]) title = self._titleBuilder(res[0][1])
if not (link and title): return u'' if not (link and title): return u''
link = link.replace('http://pro.imdb.com', '') link = link.replace('http://pro.imdb.com', '')
new_html = '<td></td><td></td><td><a href="%s">%s</a></td>' % (link, new_html = '<td class="result_text"><a href="%s">%s</a></td>' % (link,
title) title)
return new_html return new_html
@ -161,11 +157,14 @@ class DOMHTMLSearchMovieParser(DOMParserBase):
if not datum[0] and datum[1]: if not datum[0] and datum[1]:
continue continue
if datum[2] is not None: if datum[2] is not None:
akas = filter(None, datum[2].split('::')) #akas = filter(None, datum[2].split('::'))
if self._linkPrefix == '/title/tt': if self._linkPrefix == '/title/tt':
akas = [a.replace('" - ', '::').rstrip() for a in akas] # XXX (HTU): couldn't find a result with multiple akas
akas = [a.replace('aka "', '', 1).replace('aka "', aka = datum[2]
'', 1).lstrip() for a in akas] akas = [aka[1:-1]] # remove the quotes
#akas = [a.replace('" - ', '::').rstrip() for a in akas]
#akas = [a.replace('aka "', '', 1).replace('aka "',
#'', 1).lstrip() for a in akas]
datum[1]['akas'] = akas datum[1]['akas'] = akas
data['data'][idx] = (datum[0], datum[1]) data['data'][idx] = (datum[0], datum[1])
else: else:

View File

@ -7,7 +7,7 @@ for a given person.
E.g., when searching for the name "Mel Gibson", the parsed page would be: E.g., when searching for the name "Mel Gibson", the parsed page would be:
http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20 http://akas.imdb.com/find?q=Mel+Gibson&nm=on&mx=20
Copyright 2004-2010 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2008 H. Turgut Uyar <uyar@tekir.org> 2008 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -55,7 +55,7 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
"""Parse the html page that the IMDb web server shows when the """Parse the html page that the IMDb web server shows when the
"new search system" is used, for persons.""" "new search system" is used, for persons."""
_BaseParser = DOMBasicPersonParser _BaseParser = DOMBasicPersonParser
_notDirectHitTitle = '<title>imdb name' _notDirectHitTitle = '<title>find - imdb'
_titleBuilder = lambda self, x: build_name(x, canonical=True) _titleBuilder = lambda self, x: build_name(x, canonical=True)
_linkPrefix = '/name/nm' _linkPrefix = '/name/nm'
@ -74,11 +74,11 @@ class DOMHTMLSearchPersonParser(DOMHTMLSearchMovieParser):
canonical=1), x.get('akas') canonical=1), x.get('akas')
))] ))]
extractors = [Extractor(label='search', extractors = [Extractor(label='search',
path="//td[3]/a[starts-with(@href, '/name/nm')]/..", path="//td[@class='result_text']/a[starts-with(@href, '/name/nm')]/..",
attrs=_attrs)] attrs=_attrs)]
def preprocess_string(self, html_string): def preprocess_string(self, html_string):
if self._notDirectHitTitle in html_string[:1024].lower(): if self._notDirectHitTitle in html_string[:10240].lower():
html_string = _reAKASp.sub( html_string = _reAKASp.sub(
r'\1<div class="_imdbpyAKA">\2::</div>\3', r'\1<div class="_imdbpyAKA">\2::</div>\3',
html_string) html_string)

View File

@ -340,7 +340,7 @@ def build_movie(txt, movieID=None, roleID=None, status=None,
title = title[:nidx].rstrip() title = title[:nidx].rstrip()
if year: if year:
year = year.strip() year = year.strip()
if title[-1] == ')': if title[-1:] == ')':
fpIdx = title.rfind('(') fpIdx = title.rfind('(')
if fpIdx != -1: if fpIdx != -1:
if notes: notes = '%s %s' % (title[fpIdx:], notes) if notes: notes = '%s %s' % (title[fpIdx:], notes)

View File

@ -6,7 +6,7 @@ IMDb's data for mobile systems.
the imdb.IMDb function will return an instance of this class when the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile". called with the 'accessSystem' argument set to "mobile".
Copyright 2005-2011 Davide Alberani <da@erlug.linux.it> Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -193,7 +193,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
title) title)
return res return res
tl = title[0].lower() tl = title[0].lower()
if not tl.startswith('imdb title'): if not tl.startswith('find - imdb'):
# a direct hit! # a direct hit!
title = _unHtml(title[0]) title = _unHtml(title[0])
mid = None mid = None
@ -211,7 +211,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
# XXX: this results*3 prevents some recursion errors, but... # XXX: this results*3 prevents some recursion errors, but...
# it's not exactly understandable (i.e.: why 'results' is # it's not exactly understandable (i.e.: why 'results' is
# not enough to get all the results?) # not enough to get all the results?)
lis = _findBetween(cont, 'td valign="top">', '</td>', lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3) maxRes=results*3)
for li in lis: for li in lis:
akas = re_makas.findall(li) akas = re_makas.findall(li)
@ -492,7 +492,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
self._mobile_logger.warn('no title tag searching for name %s', name) self._mobile_logger.warn('no title tag searching for name %s', name)
return res return res
nl = name[0].lower() nl = name[0].lower()
if not nl.startswith('imdb name'): if not nl.startswith('find - imdb'):
# a direct hit! # a direct hit!
name = _unHtml(name[0]) name = _unHtml(name[0])
name = name.replace('- Filmography by type' , '').strip() name = name.replace('- Filmography by type' , '').strip()
@ -506,7 +506,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res return res
res[:] = [(str(pid[0]), analyze_name(name, canonical=1))] res[:] = [(str(pid[0]), analyze_name(name, canonical=1))]
else: else:
lis = _findBetween(cont, 'td valign="top">', '</td>', lis = _findBetween(cont, 'td class="result_text">', '</td>',
maxRes=results*3) maxRes=results*3)
for li in lis: for li in lis:
akas = _findBetween(li, '<em>"', '"</em>') akas = _findBetween(li, '<em>"', '"</em>')
@ -771,7 +771,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return {'data': d} return {'data': d}
def _search_character(self, name, results): def _search_character(self, name, results):
cont = subXMLRefs(self._get_search_content('char', name, results)) cont = subXMLRefs(self._get_search_content('ch', name, results))
name = _findBetween(cont, '<title>', '</title>', maxRes=1) name = _findBetween(cont, '<title>', '</title>', maxRes=1)
res = [] res = []
if not name: if not name:
@ -779,8 +779,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
name) name)
return res return res
nl = name[0].lower() nl = name[0].lower()
if not (nl.startswith('imdb search') or nl.startswith('imdb search') \ if not nl.startswith('find - imdb'):
or nl.startswith('imdb character')):
# a direct hit! # a direct hit!
name = _unHtml(name[0]).replace('(Character)', '').strip() name = _unHtml(name[0]).replace('(Character)', '').strip()
pid = None pid = None
@ -793,12 +792,7 @@ class IMDbMobileAccessSystem(IMDbHTTPAccessSystem):
return res return res
res[:] = [(str(pid[0]), analyze_name(name))] res[:] = [(str(pid[0]), analyze_name(name))]
else: else:
sects = _findBetween(cont, '<b>Popular Characters</b>', '</table>', lis = _findBetween(cont, '<td class="result_text"',
maxRes=results*3)
sects += _findBetween(cont, '<b>Characters', '</table>',
maxRes=results*3)
for sect in sects:
lis = _findBetween(sect, '<a href="/character/',
['<small', '</td>', '<br']) ['<small', '</td>', '<br'])
for li in lis: for li in lis:
li = '<%s' % li li = '<%s' % li

View File

@ -7,7 +7,7 @@ the SQLObject _AND_ SQLAlchemy Object Relational Managers is available.
the imdb.IMDb function will return an instance of this class when the imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "sql", "database" or "db". called with the 'accessSystem' argument set to "sql", "database" or "db".
Copyright 2005-2010 Davide Alberani <da@erlug.linux.it> Copyright 2005-2012 Davide Alberani <da@erlug.linux.it>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -452,7 +452,12 @@ def get_movie_data(movieID, kindDict, fromAka=0, _table=None):
else: else:
if not fromAka: Table = Title if not fromAka: Table = Title
else: Table = AkaTitle else: Table = AkaTitle
try:
m = Table.get(movieID) m = Table.get(movieID)
except Exception, e:
_aux_logger.warn('Unable to fetch information for movieID %s: %s', movieID, e)
mdict = {}
return mdict
mdict = {'title': m.title, 'kind': kindDict[m.kindID], mdict = {'title': m.title, 'kind': kindDict[m.kindID],
'year': m.productionYear, 'imdbIndex': m.imdbIndex, 'year': m.productionYear, 'imdbIndex': m.imdbIndex,
'season': m.seasonNr, 'episode': m.episodeNr} 'season': m.seasonNr, 'episode': m.episodeNr}
@ -825,14 +830,14 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = movie.imdbID imdbID = movie.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
m_dict = get_movie_data(movie.id, self._kind) m_dict = get_movie_data(movie.id, self._kind)
titline = build_title(m_dict, ptdf=1) titline = build_title(m_dict, ptdf=0)
imdbID = self.title2imdbID(titline) imdbID = self.title2imdbID(titline, m_dict['kind'])
# If the imdbID was retrieved from the web and was not in the # If the imdbID was retrieved from the web and was not in the
# database, update the database (ignoring errors, because it's # database, update the database (ignoring errors, because it's
# possible that the current user does not have update privileges). # possible that the current user does not have update privileges).
# There're times when I think I'm a genius; this is one of # There're times when I think I'm a genius; this is one of
# those times... <g> # those times... <g>
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: movie.imdbID = int(imdbID) try: movie.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -847,9 +852,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = person.imdbID imdbID = person.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex} n_dict = {'name': person.name, 'imdbIndex': person.imdbIndex}
namline = build_name(n_dict, canonical=1) namline = build_name(n_dict, canonical=False)
imdbID = self.name2imdbID(namline) imdbID = self.name2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: person.imdbID = int(imdbID) try: person.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -864,9 +869,9 @@ class IMDbSqlAccessSystem(IMDbBase):
imdbID = character.imdbID imdbID = character.imdbID
if imdbID is not None: return '%07d' % imdbID if imdbID is not None: return '%07d' % imdbID
n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex} n_dict = {'name': character.name, 'imdbIndex': character.imdbIndex}
namline = build_name(n_dict, canonical=1) namline = build_name(n_dict, canonical=False)
imdbID = self.character2imdbID(namline) imdbID = self.character2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: character.imdbID = int(imdbID) try: character.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -883,7 +888,7 @@ class IMDbSqlAccessSystem(IMDbBase):
n_dict = {'name': company.name, 'country': company.countryCode} n_dict = {'name': company.name, 'country': company.countryCode}
namline = build_company_name(n_dict) namline = build_company_name(n_dict)
imdbID = self.company2imdbID(namline) imdbID = self.company2imdbID(namline)
if imdbID is not None: if imdbID is not None and not isinstance(imdbID, list):
try: company.imdbID = int(imdbID) try: company.imdbID = int(imdbID)
except: pass except: pass
return imdbID return imdbID
@ -1116,6 +1121,7 @@ class IMDbSqlAccessSystem(IMDbBase):
if mlinks: if mlinks:
for ml in mlinks: for ml in mlinks:
lmovieData = get_movie_data(ml[0], self._kind) lmovieData = get_movie_data(ml[0], self._kind)
if lmovieData:
m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql') m = Movie(movieID=ml[0], data=lmovieData, accessSystem='sql')
ml[0] = m ml[0] = m
res['connections'] = {} res['connections'] = {}

View File

@ -466,6 +466,7 @@ class _AlchemyConnection(object):
def setConnection(uri, tables, encoding='utf8', debug=False): def setConnection(uri, tables, encoding='utf8', debug=False):
"""Set connection for every table.""" """Set connection for every table."""
params = {'encoding': encoding}
# FIXME: why on earth MySQL requires an additional parameter, # FIXME: why on earth MySQL requires an additional parameter,
# is well beyond my understanding... # is well beyond my understanding...
if uri.startswith('mysql'): if uri.startswith('mysql'):
@ -474,7 +475,11 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
else: else:
uri += '?' uri += '?'
uri += 'charset=%s' % encoding uri += 'charset=%s' % encoding
params = {'encoding': encoding}
# On some server configurations, we will need to explicitly enable # On some server configurations, we will need to explicitly enable
# loading data from local files
params['local_infile'] = 1
if debug: if debug:
params['echo'] = True params['echo'] = True
if uri.startswith('ibm_db'): if uri.startswith('ibm_db'):

Binary file not shown.

View File

@ -182,6 +182,10 @@ def setConnection(uri, tables, encoding='utf8', debug=False):
kw['use_unicode'] = 1 kw['use_unicode'] = 1
#kw['sqlobject_encoding'] = encoding #kw['sqlobject_encoding'] = encoding
kw['charset'] = encoding kw['charset'] = encoding
# On some server configurations, we will need to explicitly enable # On some server configurations, we will need to explicitly enable
# loading data from local files
kw['local_infile'] = 1
conn = connectionForURI(uri, **kw) conn = connectionForURI(uri, **kw)
conn.debug = debug conn.debug = debug
# XXX: doesn't work and a work-around was put in imdbpy2sql.py; # XXX: doesn't work and a work-around was put in imdbpy2sql.py;

View File

@ -3,7 +3,7 @@ utils module (imdb package).
This module provides basic utilities for the imdb package. This module provides basic utilities for the imdb package.
Copyright 2004-2012 Davide Alberani <da@erlug.linux.it> Copyright 2004-2013 Davide Alberani <da@erlug.linux.it>
2009 H. Turgut Uyar <uyar@tekir.org> 2009 H. Turgut Uyar <uyar@tekir.org>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@ -189,10 +189,9 @@ _unicodeArticles = linguistics.toUnicode(_articles)
articlesDicts = linguistics.articlesDictsForLang(None) articlesDicts = linguistics.articlesDictsForLang(None)
spArticles = linguistics.spArticlesForLang(None) spArticles = linguistics.spArticlesForLang(None)
def canonicalTitle(title, lang=None): def canonicalTitle(title, lang=None, imdbIndex=None):
"""Return the title in the canonic format 'Movie Title, The'; """Return the title in the canonic format 'Movie Title, The';
beware that it doesn't handle long imdb titles, but only the beware that it doesn't handle long imdb titles.
title portion, without year[/imdbIndex] or special markup.
The 'lang' argument can be used to specify the language of the title. The 'lang' argument can be used to specify the language of the title.
""" """
isUnicode = isinstance(title, unicode) isUnicode = isinstance(title, unicode)
@ -203,15 +202,19 @@ def canonicalTitle(title, lang=None):
except IndexError: except IndexError:
pass pass
if isUnicode: if isUnicode:
_format = u'%s, %s' _format = u'%s%s, %s'
else: else:
_format = '%s, %s' _format = '%s%s, %s'
ltitle = title.lower() ltitle = title.lower()
if imdbIndex:
imdbIndex = ' (%s)' % imdbIndex
else:
imdbIndex = ''
spArticles = linguistics.spArticlesForLang(lang) spArticles = linguistics.spArticlesForLang(lang)
for article in spArticles[isUnicode]: for article in spArticles[isUnicode]:
if ltitle.startswith(article): if ltitle.startswith(article):
lart = len(article) lart = len(article)
title = _format % (title[lart:], title[:lart]) title = _format % (title[lart:], imdbIndex, title[:lart])
if article[-1] == ' ': if article[-1] == ' ':
title = title[:-1] title = title[:-1]
break break
@ -383,18 +386,42 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if title.endswith('(TV)'): if title.endswith('(TV)'):
kind = u'tv movie' kind = u'tv movie'
title = title[:-4].rstrip() title = title[:-4].rstrip()
elif title.endswith('(TV Movie)'):
kind = u'tv movie'
title = title[:-10].rstrip()
elif title.endswith('(V)'): elif title.endswith('(V)'):
kind = u'video movie' kind = u'video movie'
title = title[:-3].rstrip() title = title[:-3].rstrip()
elif title.endswith('(video)'): elif title.lower().endswith('(video)'):
kind = u'video movie' kind = u'video movie'
title = title[:-7].rstrip() title = title[:-7].rstrip()
elif title.endswith('(TV Short)'):
kind = u'tv short'
title = title[:-10].rstrip()
elif title.endswith('(TV Mini-Series)'):
kind = u'tv mini series'
title = title[:-16].rstrip()
elif title.endswith('(mini)'): elif title.endswith('(mini)'):
kind = u'tv mini series' kind = u'tv mini series'
title = title[:-6].rstrip() title = title[:-6].rstrip()
elif title.endswith('(VG)'): elif title.endswith('(VG)'):
kind = u'video game' kind = u'video game'
title = title[:-4].rstrip() title = title[:-4].rstrip()
elif title.endswith('(Video Game)'):
kind = u'video game'
title = title[:-12].rstrip()
elif title.endswith('(TV Series)'):
epindex = title.find('(TV Episode) - ')
if epindex >= 0:
# It's an episode of a series.
kind = u'episode'
series_info = analyze_title(title[epindex + 15:])
result['episode of'] = series_info.get('title')
result['series year'] = series_info.get('year')
title = title[:epindex]
else:
kind = u'tv series'
title = title[:-11].rstrip()
# Search for the year and the optional imdbIndex (a roman number). # Search for the year and the optional imdbIndex (a roman number).
yi = re_year_index.findall(title) yi = re_year_index.findall(title)
if not yi: if not yi:
@ -430,9 +457,6 @@ def analyze_title(title, canonical=None, canonicalSeries=None,
if not kind: if not kind:
kind = u'tv series' kind = u'tv series'
title = title[1:-1].strip() title = title[1:-1].strip()
elif title.endswith('(TV series)'):
kind = u'tv series'
title = title[:-11].rstrip()
if not title: if not title:
raise IMDbParserError('invalid title: "%s"' % original_t) raise IMDbParserError('invalid title: "%s"' % original_t)
if canonical is not None: if canonical is not None:
@ -489,7 +513,7 @@ def _convertTime(title, fromPTDFtoWEB=1, _emptyString=u''):
def build_title(title_dict, canonical=None, canonicalSeries=None, def build_title(title_dict, canonical=None, canonicalSeries=None,
canonicalEpisode=None, ptdf=0, lang=None, _doYear=1, canonicalEpisode=None, ptdf=0, lang=None, _doYear=1,
_emptyString=u''): _emptyString=u'', appendKind=True):
"""Given a dictionary that represents a "long" IMDb title, """Given a dictionary that represents a "long" IMDb title,
return a string. return a string.
@ -511,6 +535,11 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
doYear = 0 doYear = 0
if ptdf: if ptdf:
doYear = 1 doYear = 1
# XXX: for results coming from the new search page.
if not isinstance(episode_of, (dict, _Container)):
episode_of = {'title': episode_of, 'kind': 'tv series'}
if 'series year' in title_dict:
episode_of['year'] = title_dict['series year']
pre_title = build_title(episode_of, canonical=canonicalSeries, pre_title = build_title(episode_of, canonical=canonicalSeries,
ptdf=0, _doYear=doYear, ptdf=0, _doYear=doYear,
_emptyString=_emptyString) _emptyString=_emptyString)
@ -545,12 +574,14 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
episode_title += '.%s' % episode episode_title += '.%s' % episode
episode_title += ')' episode_title += ')'
episode_title = '{%s}' % episode_title episode_title = '{%s}' % episode_title
return '%s %s' % (pre_title, episode_title) return _emptyString + '%s %s' % (_emptyString + pre_title,
_emptyString + episode_title)
title = title_dict.get('title', '') title = title_dict.get('title', '')
imdbIndex = title_dict.get('imdbIndex', '')
if not title: return _emptyString if not title: return _emptyString
if canonical is not None: if canonical is not None:
if canonical: if canonical:
title = canonicalTitle(title, lang=lang) title = canonicalTitle(title, lang=lang, imdbIndex=imdbIndex)
else: else:
title = normalizeTitle(title, lang=lang) title = normalizeTitle(title, lang=lang)
if pre_title: if pre_title:
@ -558,15 +589,20 @@ def build_title(title_dict, canonical=None, canonicalSeries=None,
if kind in (u'tv series', u'tv mini series'): if kind in (u'tv series', u'tv mini series'):
title = '"%s"' % title title = '"%s"' % title
if _doYear: if _doYear:
imdbIndex = title_dict.get('imdbIndex') year = title_dict.get('year') or '????'
year = title_dict.get('year') or u'????'
if isinstance(_emptyString, str): if isinstance(_emptyString, str):
year = str(year) year = str(year)
imdbIndex = title_dict.get('imdbIndex')
if not ptdf:
if imdbIndex and (canonical is None or canonical):
title += ' (%s)' % imdbIndex
title += ' (%s)' % year
else:
title += ' (%s' % year title += ' (%s' % year
if imdbIndex: if imdbIndex and (canonical is None or canonical):
title += '/%s' % imdbIndex title += '/%s' % imdbIndex
title += ')' title += ')'
if kind: if appendKind and kind:
if kind == 'tv movie': if kind == 'tv movie':
title += ' (TV)' title += ' (TV)'
elif kind == 'video movie': elif kind == 'video movie':

View File

@ -11,6 +11,7 @@ __author__ = "dbr/Ben"
__version__ = "1.9" __version__ = "1.9"
import os import os
import re
import time import time
import getpass import getpass
import StringIO import StringIO
@ -18,8 +19,10 @@ import tempfile
import warnings import warnings
import logging import logging
import zipfile import zipfile
import datetime as dt
import requests import requests
import cachecontrol import cachecontrol
import xmltodict
try: try:
import xml.etree.cElementTree as ElementTree import xml.etree.cElementTree as ElementTree
@ -31,6 +34,7 @@ try:
except ImportError: except ImportError:
gzip = None gzip = None
from lib.dateutil.parser import parse
from cachecontrol import caches from cachecontrol import caches
from tvdb_ui import BaseUI, ConsoleUI from tvdb_ui import BaseUI, ConsoleUI
@ -560,35 +564,60 @@ class Tvdb:
except requests.Timeout, e: except requests.Timeout, e:
raise tvdb_error("Connection timed out " + str(e.message) + " while loading URL " + str(url)) raise tvdb_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
if 'application/zip' in resp.headers.get("Content-Type", '') and resp.ok: def process(path, key, value):
key = key.lower()
# clean up value and do type changes
if value:
try:
# convert to integer if needed
if value.isdigit():
value = int(value)
except:
pass
if key in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
try:
if key == 'firstaired' and value in "0000-00-00":
new_value = str(dt.date.fromordinal(1))
new_value = re.sub("([-]0{2}){1,}", "", new_value)
fixDate = parse(new_value, fuzzy=True).date()
value = fixDate.strftime("%Y-%m-%d")
elif key == 'firstaired':
value = parse(value, fuzzy=True).date()
value = value.strftime("%Y-%m-%d")
except:
pass
value = self._cleanData(value)
return (key, value)
if resp.ok:
if 'application/zip' in resp.headers.get("Content-Type", ''):
try: try:
# TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20] # TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20]
log().debug("We recived a zip file unpacking now ...") log().debug("We recived a zip file unpacking now ...")
zipdata = StringIO.StringIO() zipdata = StringIO.StringIO()
zipdata.write(resp.content) zipdata.write(resp.content)
myzipfile = zipfile.ZipFile(zipdata) myzipfile = zipfile.ZipFile(zipdata)
return myzipfile.read('%s.xml' % language) return xmltodict.parse(myzipfile.read('%s.xml' % language), postprocessor=process)
except zipfile.BadZipfile: except zipfile.BadZipfile:
raise tvdb_error("Bad zip file received from thetvdb.com, could not read it") raise tvdb_error("Bad zip file received from thetvdb.com, could not read it")
else:
return resp.content if resp.ok else None return xmltodict.parse(resp.text.strip(), postprocessor=process)
def _getetsrc(self, url, params=None, language=None): def _getetsrc(self, url, params=None, language=None):
"""Loads a URL using caching, returns an ElementTree of the source """Loads a URL using caching, returns an ElementTree of the source
""" """
src = self._loadUrl(url, params=params, language=language)
try: try:
# TVDB doesn't sanitize \r (CR) from user input in some fields,
# remove it to avoid errors. Change from SickBeard, from will14m
return ElementTree.fromstring(src.rstrip("\r")) if src else None
except SyntaxError:
src = self._loadUrl(url, params=params, language=language) src = self._loadUrl(url, params=params, language=language)
try: src = [src[item] for item in src][0]
return ElementTree.fromstring(src.rstrip("\r")) if src else None except:
except SyntaxError, exceptionmsg: errormsg = "There was an error with the XML retrieved from thetvdb.com:"
errormsg = "There was an error with the XML retrieved from thetvdb.com:\n%s" % (
exceptionmsg
)
if self.config['cache_enabled']: if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % ( errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
@ -599,6 +628,8 @@ class Tvdb:
errormsg += "\nhttp://dbr.lighthouseapp.com/projects/13342-tvdb_api/overview\n" errormsg += "\nhttp://dbr.lighthouseapp.com/projects/13342-tvdb_api/overview\n"
raise tvdb_error(errormsg) raise tvdb_error(errormsg)
return src
def _setItem(self, sid, seas, ep, attrib, value): def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and """Creates a new episode, creating Show(), Season() and
Episode()s as required. Called by _getShowData to populate show Episode()s as required. Called by _getShowData to populate show
@ -649,9 +680,8 @@ class Tvdb:
log().debug("Searching for show %s" % series) log().debug("Searching for show %s" % series)
self.config['params_getSeries']['seriesname'] = series self.config['params_getSeries']['seriesname'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries']) seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(), s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series): def _getSeries(self, series):
"""This searches TheTVDB.com for the series name, """This searches TheTVDB.com for the series name,
@ -798,24 +828,13 @@ class Tvdb:
self.config['url_seriesInfo'] % (sid, getShowInLanguage) self.config['url_seriesInfo'] % (sid, getShowInLanguage)
) )
if seriesInfoEt is None: return False # check and make sure we have data to process and that it contains a series name
for curInfo in seriesInfoEt.findall("Series")[0]: if seriesInfoEt is None or 'seriesname' not in seriesInfoEt['series']:
tag = curInfo.tag.lower()
value = curInfo.text
if tag == 'seriesname' and value is None:
return False return False
if value is not None: for k, v in seriesInfoEt['series'].items():
if tag == 'id': self._setShowData(sid, k, v)
value = int(value)
if tag in ['banner', 'fanart', 'poster']:
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
if seriesSearch: if seriesSearch:
return True return True
@ -837,63 +856,40 @@ class Tvdb:
epsEt = self._getetsrc(url, language=language) epsEt = self._getetsrc(url, language=language)
for cur_ep in epsEt.findall("Episode"): for cur_ep in epsEt["episode"]:
if self.config['dvdorder']: if self.config['dvdorder']:
log().debug('Using DVD ordering.') log().debug('Using DVD ordering.')
use_dvd = cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None use_dvd = cur_ep['dvd_season'] != None and cur_ep['dvd_episodenumber'] != None
else: else:
use_dvd = False use_dvd = False
if use_dvd: if use_dvd:
elem_seasnum, elem_epno = cur_ep.find('DVD_season'), cur_ep.find('DVD_episodenumber') seasnum, epno = cur_ep['dvd_season'], cur_ep['dvd_episodenumber']
else: else:
elem_seasnum, elem_epno = cur_ep.find('SeasonNumber'), cur_ep.find('EpisodeNumber') seasnum, epno = cur_ep['seasonnumber'], cur_ep['episodenumber']
if elem_seasnum is None or elem_epno is None:
if seasnum is None or epno is None:
log().warning("An episode has incomplete season/episode number (season: %r, episode: %r)" % ( log().warning("An episode has incomplete season/episode number (season: %r, episode: %r)" % (
elem_seasnum, elem_epno)) seasnum, epno))
log().debug(
" ".join(
"%r is %r" % (child.tag, child.text) for child in cur_ep.getchildren()))
# TODO: Should this happen?
continue # Skip to next episode continue # Skip to next episode
# float() is because https://github.com/dbr/tvnamer/issues/95 - should probably be fixed in TVDB data # float() is because https://github.com/dbr/tvnamer/issues/95 - should probably be fixed in TVDB data
seas_no = int(float(elem_seasnum.text)) seas_no = int(float(seasnum))
ep_no = int(float(elem_epno.text)) ep_no = int(float(epno))
useDVD = False for k,v in cur_ep.items():
k = k.lower()
if (self.config['dvdorder']): if v is not None:
log().debug('DVD Order? Yes') if k == 'id':
useDVD = (cur_ep.find('DVD_season').text != None and cur_ep.find('DVD_episodenumber').text != None) v = int(v)
if k == 'filename':
v = self.config['url_artworkPrefix'] % (v)
else: else:
log().debug('DVD Order? No') v = self._cleanData(v)
if (useDVD): self._setItem(sid, seas_no, ep_no, k, v)
log().debug('Use DVD Order? Yes')
seas_no = int(cur_ep.find('DVD_season').text)
ep_no = int(float(cur_ep.find('DVD_episodenumber').text))
else:
log().debug('Use DVD Order? No')
seas_no = int(cur_ep.find('SeasonNumber').text)
ep_no = int(cur_ep.find('EpisodeNumber').text)
for cur_item in cur_ep.getchildren():
tag = cur_item.tag.lower()
value = cur_item.text
if value is not None:
if tag == 'id':
value = int(value)
if tag == 'filename':
value = self.config['url_artworkPrefix'] % (value)
else:
value = self._cleanData(value)
self._setItem(sid, seas_no, ep_no, tag, value)
return True return True

View File

@ -24,6 +24,7 @@ import logging
import datetime as dt import datetime as dt
import requests import requests
import cachecontrol import cachecontrol
import xmltodict
try: try:
import xml.etree.cElementTree as ElementTree import xml.etree.cElementTree as ElementTree
@ -37,9 +38,11 @@ from tvrage_ui import BaseUI
from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound, from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound,
tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound) tvrage_seasonnotfound, tvrage_episodenotfound, tvrage_attributenotfound)
def log(): def log():
return logging.getLogger("tvrage_api") return logging.getLogger("tvrage_api")
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None): def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
"""Retry calling the decorated function using an exponential backoff. """Retry calling the decorated function using an exponential backoff.
@ -83,6 +86,7 @@ def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
return deco_retry return deco_retry
class ShowContainer(dict): class ShowContainer(dict):
"""Simple dict that holds a series of Show instances """Simple dict that holds a series of Show instances
""" """
@ -112,6 +116,7 @@ class ShowContainer(dict):
class Show(dict): class Show(dict):
"""Holds a dict of seasons, and show data. """Holds a dict of seasons, and show data.
""" """
def __init__(self): def __init__(self):
dict.__init__(self) dict.__init__(self)
self.data = {} self.data = {}
@ -261,8 +266,10 @@ class Episode(dict):
if cur_value.find(unicode(term).lower()) > -1: if cur_value.find(unicode(term).lower()) > -1:
return self return self
class TVRage: class TVRage:
"""Create easy-to-use interface to name of season/episode name""" """Create easy-to-use interface to name of season/episode name"""
def __init__(self, def __init__(self,
interactive=False, interactive=False,
select_first=False, select_first=False,
@ -390,9 +397,9 @@ class TVRage:
# get response from TVRage # get response from TVRage
if self.config['cache_enabled']: if self.config['cache_enabled']:
resp = self.sess.get(url, cache_auto=True, params=params) resp = self.sess.get(url.strip(), cache_auto=True, params=params)
else: else:
resp = requests.get(url, params=params) resp = requests.get(url.strip(), params=params)
except requests.HTTPError, e: except requests.HTTPError, e:
raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url)) raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url))
@ -403,12 +410,8 @@ class TVRage:
except requests.Timeout, e: except requests.Timeout, e:
raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url)) raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url))
return resp.content if resp.ok else None def remap_keys(path, key, value):
name_map = {
def _getetsrc(self, url, params=None):
"""Loads a URL using caching, returns an ElementTree of the source
"""
reDict = {
'showid': 'id', 'showid': 'id',
'showname': 'seriesname', 'showname': 'seriesname',
'name': 'seriesname', 'name': 'seriesname',
@ -422,54 +425,59 @@ class TVRage:
'title': 'episodename', 'title': 'episodename',
'airdate': 'firstaired', 'airdate': 'firstaired',
'screencap': 'filename', 'screencap': 'filename',
'seasonnum': 'episodenumber', 'seasonnum': 'episodenumber'
} }
robj = re.compile('|'.join(reDict.keys()))
src = self._loadUrl(url, params)
try: try:
# TVRAGE doesn't sanitize \r (CR) from user input in some fields, key = name_map[key.lower()]
# remove it to avoid errors. Change from SickBeard, from will14m except (ValueError, TypeError, KeyError):
xml = ElementTree.fromstring(src.rstrip("\r")) key.lower()
tree = ElementTree.ElementTree(xml)
for elm in tree.findall('.//*'): # clean up value and do type changes
elm.tag = robj.sub(lambda m: reDict[m.group(0)], elm.tag) if value:
if isinstance(value, dict):
if key == 'network':
value = value['#text']
if key == 'genre':
value = value['genre']
if not isinstance(value, list):
value = [value]
value = '|' + '|'.join(value) + '|'
if elm.tag in 'firstaired':
try: try:
if elm.text in "0000-00-00": # convert to integer if needed
elm.text = str(dt.date.fromordinal(1)) if value.isdigit():
elm.text = re.sub("([-]0{2}){1,}", "", elm.text) value = int(value)
fixDate = parse(elm.text, fuzzy=True).date()
elm.text = fixDate.strftime("%Y-%m-%d")
except: except:
pass pass
return ElementTree.fromstring(ElementTree.tostring(xml))
except SyntaxError:
src = self._loadUrl(url, params)
try:
xml = ElementTree.fromstring(src.rstrip("\r"))
tree = ElementTree.ElementTree(xml)
for elm in tree.findall('.//*'):
elm.tag = robj.sub(lambda m: reDict[m.group(0)], elm.tag)
if elm.tag in 'firstaired' and elm.text:
if elm.text == "0000-00-00":
elm.text = str(dt.date.fromordinal(1))
try: try:
#month = strptime(match.group('air_month')[:3],'%b').tm_mon if key == 'firstaired' and value in "0000-00-00":
#day = re.sub("(st|nd|rd|th)", "", match.group('air_day')) new_value = str(dt.date.fromordinal(1))
#dtStr = '%s/%s/%s' % (year, month, day) new_value = re.sub("([-]0{2}){1,}", "", new_value)
fixDate = parse(new_value, fuzzy=True).date()
fixDate = parse(elm.text, fuzzy=True) value = fixDate.strftime("%Y-%m-%d")
elm.text = fixDate.strftime("%Y-%m-%d") elif key == 'firstaired':
value = parse(value, fuzzy=True).date()
value = value.strftime("%Y-%m-%d")
except: except:
pass pass
return ElementTree.fromstring(ElementTree.tostring(xml))
except SyntaxError, exceptionmsg: value = self._cleanData(value)
errormsg = "There was an error with the XML retrieved from tvrage.com:\n%s" % ( return (key, value)
exceptionmsg
) if resp.ok:
return xmltodict.parse(resp.text.strip(), postprocessor=remap_keys)
def _getetsrc(self, url, params=None):
"""Loads a URL using caching, returns an ElementTree of the source
"""
try:
src = self._loadUrl(url, params)
src = [src[item] for item in src][0]
except:
errormsg = "There was an error with the XML retrieved from tvrage.com"
if self.config['cache_enabled']: if self.config['cache_enabled']:
errormsg += "\nFirst try emptying the cache folder at..\n%s" % ( errormsg += "\nFirst try emptying the cache folder at..\n%s" % (
@ -479,6 +487,8 @@ class TVRage:
errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on\n" errormsg += "\nIf this does not resolve the issue, please try again later. If the error persists, report a bug on\n"
raise tvrage_error(errormsg) raise tvrage_error(errormsg)
return src
def _setItem(self, sid, seas, ep, attrib, value): def _setItem(self, sid, seas, ep, attrib, value):
"""Creates a new episode, creating Show(), Season() and """Creates a new episode, creating Show(), Season() and
Episode()s as required. Called by _getShowData to populate show Episode()s as required. Called by _getShowData to populate show
@ -529,9 +539,8 @@ class TVRage:
log().debug("Searching for show %s" % series) log().debug("Searching for show %s" % series)
self.config['params_getSeries']['show'] = series self.config['params_getSeries']['show'] = series
seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries']) seriesEt = self._getetsrc(self.config['url_getSeries'], self.config['params_getSeries'])
allSeries = list(dict((s.tag.lower(),s.text) for s in x.getchildren()) for x in seriesEt)
return allSeries return [seriesEt[item] for item in seriesEt][0]
def _getSeries(self, series): def _getSeries(self, series):
"""This searches tvrage.com for the series name, """This searches tvrage.com for the series name,
@ -568,60 +577,47 @@ class TVRage:
self.config['params_seriesInfo'] self.config['params_seriesInfo']
) )
if seriesInfoEt is None: return False # check and make sure we have data to process and that it contains a series name
for curInfo in seriesInfoEt: if seriesInfoEt is None or 'seriesname' not in seriesInfoEt:
tag = curInfo.tag.lower()
value = curInfo.text
if tag == 'seriesname' and value is None:
return False return False
if tag == 'id': for k, v in seriesInfoEt.items():
value = int(value) self._setShowData(sid, k, v)
if value is not None: # series search ends here
value = self._cleanData(value) if seriesSearch:
return True
self._setShowData(sid, tag, value)
if seriesSearch: return True
try:
# Parse genre data
log().debug('Getting genres of %s' % (sid))
for genre in seriesInfoEt.find('genres'):
tag = genre.tag.lower()
value = genre.text
if value is not None:
value = self._cleanData(value)
self._setShowData(sid, tag, value)
except Exception:
log().debug('No genres for %s' % (sid))
# Parse episode data # Parse episode data
log().debug('Getting all episodes of %s' % (sid)) log().debug('Getting all episodes of %s' % (sid))
self.config['params_epInfo']['sid'] = sid self.config['params_epInfo']['sid'] = sid
epsEt = self._getetsrc(self.config['url_epInfo'], self.config['params_epInfo']) epsEt = self._getetsrc(self.config['url_epInfo'], self.config['params_epInfo'])
for cur_list in epsEt.findall("Episodelist"):
for cur_seas in cur_list: for season in epsEt['Episodelist']['Season']:
try: episodes = season['episode']
seas_no = int(cur_seas.attrib['no']) if not isinstance(episodes, list):
for cur_ep in cur_seas: episodes = [episodes]
ep_no = int(cur_ep.find('episodenumber').text)
for episode in episodes:
seas_no = int(season['@no'])
ep_no = int(episode['episodenumber'])
self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no) self._setItem(sid, seas_no, ep_no, 'seasonnumber', seas_no)
for cur_item in cur_ep:
tag = cur_item.tag.lower()
value = cur_item.text for k,v in episode.items():
if value is not None: try:
if tag == 'id': k = k.lower()
value = int(value) if v is not None:
if k == 'link':
v = v.rsplit('/', 1)[1]
k = 'id'
value = self._cleanData(value) if k == 'id':
v = int(v)
self._setItem(sid, seas_no, ep_no, tag, value) v = self._cleanData(v)
self._setItem(sid, seas_no, ep_no, k, v)
except: except:
continue continue
return True return True
@ -673,11 +669,13 @@ def main():
grabs an episode name interactively. grabs an episode name interactively.
""" """
import logging import logging
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
tvrage_instance = TVRage(cache=False) tvrage_instance = TVRage(cache=False)
print tvrage_instance['Lost']['seriesname'] print tvrage_instance['Lost']['seriesname']
print tvrage_instance['Lost'][1][4]['episodename'] print tvrage_instance['Lost'][1][4]['episodename']
if __name__ == '__main__': if __name__ == '__main__':
main() main()

359
lib/xmltodict.py Normal file
View File

@ -0,0 +1,359 @@
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON"
from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
try: # pragma no cover
from cStringIO import StringIO
except ImportError: # pragma no cover
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
try: # pragma no cover
from collections import OrderedDict
except ImportError: # pragma no cover
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict
try: # pragma no cover
_basestring = basestring
except NameError: # pragma no cover
_basestring = str
try: # pragma no cover
_unicode = unicode
except NameError: # pragma no cover
_unicode = str
__author__ = 'Martin Blech'
__version__ = '0.9.0'
__license__ = 'MIT'
class ParsingInterrupted(Exception):
    """Raised to stop parsing when a streaming item callback returns a
    false-ish value."""
class _DictSAXHandler(object):
    """Expat callback target that folds XML events into nested dictionaries.

    The three expat hooks (`startElement`, `endElement`, `characters`) keep
    a stack of partially built containers; once the root element closes,
    `item` holds the finished structure.

    Constructor options:

    * `item_depth` / `item_callback` -- when the element path is exactly
      `item_depth` deep at an end tag, `item_callback(path, item)` is
      called; a false-ish return raises `ParsingInterrupted`.
    * `xml_attribs` -- keep attributes (keys prefixed with `attr_prefix`)
      or drop them.
    * `cdata_key`, `force_cdata`, `cdata_separator` -- key under which text
      is stored next to attributes/children, whether text-only elements are
      wrapped in a dict as well, and the glue used between text chunks.
    * `postprocessor` -- optional `f(path, key, value)` returning a new
      `(key, value)` pair, or `None` to skip the entry.
    * `dict_constructor` -- factory for every mapping that is created.
    * `strip_whitespace` -- strip element text; empty text becomes `None`.
    * `namespace_separator`, `namespaces` -- used by `_build_name` to
      shorten namespace-qualified names.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None):
        # Parser state.
        self.path = []
        self.stack = []
        self.item = None
        self.data = None
        # Streaming options.
        self.item_depth = item_depth
        self.item_callback = item_callback
        # Output shaping options.
        self.xml_attribs = xml_attribs
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.strip_whitespace = strip_whitespace
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        # Namespace options.
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces

    def _build_name(self, full_name):
        """Return `full_name` with its namespace part replaced by the short
        form registered in `self.namespaces` (dropped if that is empty)."""
        if not self.namespaces:
            return full_name
        split_at = full_name.rfind(self.namespace_separator)
        if split_at == -1:
            return full_name
        uri = full_name[:split_at]
        local_name = full_name[split_at+1:]
        prefix = self.namespaces.get(uri, uri)
        if prefix:
            return self.namespace_separator.join((prefix, local_name))
        return local_name

    def _attrs_to_dict(self, attrs):
        """Turn a flat `[name, value, name, value, ...]` sequence into a
        mapping; mappings are passed through untouched."""
        if not isinstance(attrs, dict):
            attrs = self.dict_constructor(zip(attrs[0::2], attrs[1::2]))
        return attrs

    def startElement(self, full_name, attrs):
        """Record the opening tag and start a fresh container for it."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        self.path.append((name, attrs or None))
        if len(self.path) <= self.item_depth:
            # Still above the streaming depth: nothing is collected yet.
            return
        self.stack.append((self.item, self.data))
        prefixed = None
        if self.xml_attribs:
            prefixed = self.dict_constructor(
                (self.attr_prefix+attr_name, attr_value)
                for (attr_name, attr_value) in attrs.items())
        # An element without attributes starts out as None, not as {}.
        self.item = prefixed or None
        self.data = None

    def endElement(self, full_name):
        """Close the current element and attach it to its parent."""
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            finished = self.data if self.item is None else self.item
            if not self.item_callback(self.path, finished):
                raise ParsingInterrupted()
        if self.stack:
            child, text = self.item, self.data
            self.item, self.data = self.stack.pop()
            if text is not None and self.strip_whitespace:
                text = text.strip() or None
            if child is None and text and self.force_cdata:
                child = self.dict_constructor()
            if child is None:
                # Text-only (or empty) element: store the bare text.
                self.item = self.push_data(self.item, name, text)
            else:
                if text:
                    self.push_data(child, self.cdata_key, text)
                self.item = self.push_data(self.item, name, child)
        else:
            self.item = self.data = None
        self.path.pop()

    def characters(self, data):
        """Accumulate character data for the element being parsed."""
        if self.data:
            self.data += self.cdata_separator + data
        else:
            self.data = data

    def push_data(self, item, key, data):
        """Store `data` under `key` in `item` and return the container.

        The postprocessor, if any, may rewrite the pair or veto it by
        returning `None`. A missing container is created on demand, and a
        key seen more than once is collected into a list.
        """
        if self.postprocessor is not None:
            processed = self.postprocessor(self.path, key, data)
            if processed is None:
                return item
            key, data = processed
        if item is None:
            item = self.dict_constructor()
        try:
            existing = item[key]
        except KeyError:
            item[key] = data
        else:
            if isinstance(existing, list):
                existing.append(data)
            else:
                item[key] = [existing, data]
        return item
def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object (anything
    exposing a `read` attribute is handed to the parser as a file).

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print 'path:%s item:%s' % (path, item)
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    Other parameters:

    * `encoding` is passed to `expat.ParserCreate`; unicode input is encoded
      with it first (UTF-8 when no encoding is given).
    * `process_namespaces` / `namespace_separator`: the separator is only
      handed to the expat parser when `process_namespaces` is true; the
      handler always receives it.
    * Any remaining keyword arguments are forwarded to the internal handler
      (`item_depth`, `item_callback`, `xml_attribs`, `postprocessor`, ...).
    """
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    # Pick the entry point from the input itself rather than by catching
    # TypeError/AttributeError around ParseFile(): those exceptions can also
    # come from user callbacks (postprocessor, item_callback) while a file is
    # being parsed, and retrying with Parse() on the file object would hide
    # the real error behind an unrelated one.
    if hasattr(xml_input, 'read'):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item
def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t'):
    """Write `key`/`value` to `content_handler` as XML, recursing into
    child elements.

    `value` may be a scalar, `None` (empty element), a dict, or a list/tuple
    of those (one element per entry). Inside a dict, keys starting with
    `attr_prefix` become attributes, the `cdata_key` entry becomes the
    element text and every other key becomes a child element.

    `preprocessor`, when given, is called as `preprocessor(key, value)` and
    must return a replacement `(key, value)` pair, or `None` to skip the
    entry. With `pretty` set, `newl` and `indent` are emitted as ignorable
    whitespace around nested elements.

    Non-text attribute values and character data (numbers, booleans, ...)
    are converted to text before being handed to the content handler, the
    same way scalar element values already are. `None` attribute values are
    passed through unchanged.

    Raises `ValueError` if more than one root element is supplied at
    `depth` 0.
    """
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    if not isinstance(value, (list, tuple)):
        value = [value]
    if depth == 0 and len(value) > 1:
        raise ValueError('document with multiple roots')
    # Types passed to the content handler untouched; anything else is
    # stringified first.
    text_types = (_basestring, bytes)
    for v in value:
        if v is None:
            v = OrderedDict()
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                # e.g. {'#text': 0}: the handler only accepts text, and would
                # silently drop a falsy number.
                if iv is not None and not isinstance(iv, text_types):
                    iv = _unicode(iv)
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                # e.g. {'@count': 3}: attribute quoting requires text.
                if iv is not None and not isinstance(iv, text_types):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            **kwargs):
    """Serialize `input_dict` as an XML document (the inverse of `parse`).

    `input_dict` must contain exactly one key: the root element. When
    `output` (a file-like object) is supplied the document is written to it
    and nothing is returned; otherwise the document is returned as a string.

    Keys starting with `attr_prefix` (default `'@'`) are written as XML
    attributes, and the `cdata_key` entry (default `'#text'`) as character
    data. `full_document` controls whether the XML declaration is emitted.

    Pass `pretty=True` for pretty-printing; by default lines then end with a
    newline and are indented with a tab, which `newl` and `indent` override.
    """
    ((root_key, root_value),) = input_dict.items()
    wants_string = output is None
    if wants_string:
        output = StringIO()
    generator = XMLGenerator(output, encoding)
    if full_document:
        generator.startDocument()
    _emit(root_key, root_value, generator, **kwargs)
    if full_document:
        generator.endDocument()
    if not wants_string:
        return None
    document = output.getvalue()
    try:  # pragma no cover
        document = document.decode(encoding)
    except AttributeError:  # pragma no cover
        pass
    return document
if __name__ == '__main__':  # pragma: no cover
    # Command-line streaming mode: read XML from stdin and marshal every
    # (path, item) found at the requested depth to stdout.
    import sys
    import marshal

    # marshal.dump() and expat's ParseFile() both need byte streams. On
    # Python 3 the standard streams are text wrappers, so use their
    # underlying binary buffers; Python 2 streams have no `.buffer` and are
    # already byte-oriented.
    try:
        stdin = sys.stdin.buffer
        stdout = sys.stdout.buffer
    except AttributeError:
        stdin = sys.stdin
        stdout = sys.stdout

    (item_depth,) = sys.argv[1:]
    item_depth = int(item_depth)

    def handle_item(path, item):
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        if item_depth == 0:
            # Non-streaming: emit the whole document as a single item.
            handle_item([], root)
    except KeyboardInterrupt:
        pass

View File

@ -782,15 +782,11 @@ class GenericMetadata():
# Try and get posters and fanart from TMDB # Try and get posters and fanart from TMDB
if image_url is None: if image_url is None:
for show_name in set(allPossibleShowNames(show_obj)):
if image_type in ('poster', 'poster_thumb'): if image_type in ('poster', 'poster_thumb'):
image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True) image_url = self._retrieve_show_images_from_tmdb(show_obj, poster=True)
elif image_type == 'fanart': elif image_type == 'fanart':
image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True) image_url = self._retrieve_show_images_from_tmdb(show_obj, backdrop=True)
if image_url:
break
if image_url: if image_url:
image_data = metadata_helpers.getShowImage(image_url, which) image_data = metadata_helpers.getShowImage(image_url, which)
return image_data return image_data
@ -965,8 +961,6 @@ class GenericMetadata():
return (indexer_id, name, indexer) return (indexer_id, name, indexer)
def _retrieve_show_images_from_tmdb(self, show, backdrop=False, poster=False): def _retrieve_show_images_from_tmdb(self, show, backdrop=False, poster=False):
tmdb_id = None
# get TMDB configuration info # get TMDB configuration info
tmdb = TMDB(sickbeard.TMDB_API_KEY) tmdb = TMDB(sickbeard.TMDB_API_KEY)
config = tmdb.Configuration() config = tmdb.Configuration()
@ -981,27 +975,14 @@ class GenericMetadata():
try: try:
search = tmdb.Search() search = tmdb.Search()
for result in search.collection({'query': show.name}) + search.tv({'query': show.name}): for show_name in set(allPossibleShowNames(show)):
tmdb_id = result['id'] for result in search.collection({'query': show_name})['results'] + search.tv({'query': show_name})['results']:
external_ids = tmdb.TV(tmdb_id).external_ids() if backdrop and result['backdrop_path']:
if show.indexerid in [external_ids['tvdb_id'], external_ids['tvrage_id']]: return "{0}{1}{2}".format(base_url, max_size, result['backdrop_path'])
break elif poster and result['poster_path']:
return "{0}{1}{2}".format(base_url, max_size, result['poster_path'])
if tmdb_id: except Exception, e:
images = tmdb.Collections(tmdb_id).images()
if len(images) > 0:
# get backdrop urls
if backdrop:
rel_path = images['backdrops'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
# get poster urls
if poster:
rel_path = images['posters'][0]['file_path']
url = "{0}{1}{2}".format(base_url, max_size, rel_path)
return url
except:
pass pass
logger.log(u"Could not find any posters or background for " + show.name, logger.DEBUG) logger.log(u"Could not find any posters or background for " + show.name, logger.DEBUG)

View File

@ -829,7 +829,7 @@ class TVShow(object):
self.airs = myEp["airs_dayofweek"] + " " + myEp["airs_time"] self.airs = myEp["airs_dayofweek"] + " " + myEp["airs_time"]
if getattr(myEp, 'firstaired', None) is not None: if getattr(myEp, 'firstaired', None) is not None:
self.startyear = int(myEp["firstaired"].split('-')[0]) self.startyear = int(str(myEp["firstaired"]).split('-')[0])
self.status = getattr(myEp, 'status', '') self.status = getattr(myEp, 'status', '')
@ -855,7 +855,6 @@ class TVShow(object):
i = imdb.IMDb() i = imdb.IMDb()
imdbTv = i.get_movie(str(re.sub("[^0-9]", "", self.imdbid))) imdbTv = i.get_movie(str(re.sub("[^0-9]", "", self.imdbid)))
test = imdbTv.keys()
for key in filter(lambda x: x.replace('_', ' ') in imdbTv.keys(), imdb_info.keys()): for key in filter(lambda x: x.replace('_', ' ') in imdbTv.keys(), imdb_info.keys()):
# Store only the first value for string type # Store only the first value for string type
if type(imdb_info[key]) == type('') and type(imdbTv.get(key)) == type([]): if type(imdb_info[key]) == type('') and type(imdbTv.get(key)) == type([]):
@ -1556,7 +1555,7 @@ class TVEpisode(object):
self.deleteEpisode() self.deleteEpisode()
return False return False
if myEp["absolute_number"] == None or myEp["absolute_number"] == "": if getattr(myEp, 'absolute_number', None) is None:
logger.log(u"This episode (" + self.show.name + " - " + str(season) + "x" + str( logger.log(u"This episode (" + self.show.name + " - " + str(season) + "x" + str(
episode) + ") has no absolute number on " + sickbeard.indexerApi( episode) + ") has no absolute number on " + sickbeard.indexerApi(
self.indexer).name self.indexer).name
@ -1564,7 +1563,7 @@ class TVEpisode(object):
else: else:
logger.log( logger.log(
str(self.show.indexerid) + ": The absolute_number for " + str(season) + "x" + str(episode) + " is : " + str(self.show.indexerid) + ": The absolute_number for " + str(season) + "x" + str(episode) + " is : " +
myEp["absolute_number"], logger.DEBUG) str(myEp["absolute_number"]), logger.DEBUG)
self.absolute_number = int(myEp["absolute_number"]) self.absolute_number = int(myEp["absolute_number"])
self.name = getattr(myEp, 'episodename', "") self.name = getattr(myEp, 'episodename', "")
@ -1603,6 +1602,7 @@ class TVEpisode(object):
u"The show dir is missing, not bothering to change the episode statuses since it'd probably be invalid") u"The show dir is missing, not bothering to change the episode statuses since it'd probably be invalid")
return return
if self.location:
logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str( logger.log(str(self.show.indexerid) + u": Setting status for " + str(season) + "x" + str(
episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG) episode) + " based on status " + str(self.status) + " and existence of " + self.location, logger.DEBUG)