parser.mobile package (imdb package).
This package provides the IMDbMobileAccessSystem class used to access
IMDb's data for mobile systems.
The imdb.IMDb function will return an instance of this class when
called with the 'accessSystem' argument set to "mobile".
Copyright 2005-2011 Davide Alberani .*? ', '
"""Return a list of Person objects, from the string s; items
are assumed to be separated by the sep string."""
names = s.split(sep)
pl = []
plappend = pl.append
counter = 1
for name in names:
pid = re_imdbID.findall(name)
if not pid: continue
characters = _getTagsWith(name, 'class="char"',
toClosure=True, maxRes=1)
chpids = []
if characters:
for ch in characters[0].split(' / '):
chid = re_imdbID.findall(ch)
if not chid:
if not chpids:
chpids = None
elif len(chpids) == 1:
chpids = chpids[0]
name = _unHtml(name)
# Catch unclosed tags.
gt_indx = name.find('>')
if gt_indx != -1:
name = name[gt_indx+1:].lstrip()
if not name: continue
if name.endswith('...'):
name = name[:-3]
p = build_person(name, personID=str(pid[0]), billingPos=counter,
modFunct=self._defModFunct, roleID=chpids,
counter += 1
return pl
def _search_movie(self, title, results):
##params = urllib.urlencode({'tt': 'on','mx': str(results),'q': title})
##params = 'q=%s&tt=on&mx=%s' % (urllib.quote_plus(title), str(results))
##cont = self._mretrieve(imdbURL_search % params)
cont = subXMLRefs(self._get_search_content('tt', title, results))
title = _findBetween(cont, '', '
', maxRes=1)
if years:
years[:] = _findBetween(years[0], 'TV series', '',
if years:
d['series years'] = years[0].strip()
air_date = _findBetween(cont, 'Original Air Date:', '',
if air_date:
air_date = air_date[0]
vi = air_date.find('(')
if vi != -1:
date = _unHtml(air_date[:vi]).strip()
if date != '????':
d['original air date'] = date
air_date = air_date[vi:]
# Extract the season and episode from the air-date text, which at this
# point starts at its first '(' (presumably "(Season N, Episode M)" --
# confirm against a live page).  Each value is stored as an int when it
# is numeric, otherwise as the stripped non-empty string.
# NOTE(review): _inttype is defined elsewhere; presumably the int type.
season = _findBetween(air_date, 'Season', ',', maxRes=1)
if season:
    season = season[0].strip()
    try:
        season = int(season)
    except ValueError:
        # Not a plain number: keep the stripped text as-is.
        pass
    # The type check keeps a season parsed as 0, which is falsy.
    if season or type(season) is _inttype:
        d['season'] = season
episode = _findBetween(air_date, 'Episode', ')', maxRes=1)
if episode:
    episode = episode[0].strip()
    try:
        episode = int(episode)
    except ValueError:
        # Not a plain number: keep the stripped text as-is.
        pass
    # Check the episode's own type (the original tested `season` here),
    # so that an episode parsed as 0 is kept regardless of the season.
    if episode or type(episode) is _inttype:
        d['episode'] = episode
direct = _findBetween(cont, 'Director', ('', '
', '',
if seasons:
d['number of seasons'] = seasons[0].count('|') + 1
creator = _findBetween(cont, 'Created by', ('class="tn15more"',
if direct:
direct = direct[0]
h5idx = direct.find('/h5>')
if h5idx != -1:
direct = direct[h5idx+4:]
direct = self._getPersons(direct)
if direct: d['director'] = direct
if kind in ('tv series', 'tv mini series', 'episode'):
if kind != 'episode':
seasons = _findBetween(cont, 'Seasons:
if not creator:
# They change 'Created by' to 'Creator' and vice versa
# from time to time...
# XXX: is 'Creators' also used?
creator = _findBetween(cont, 'Creator:',
('class="tn15more"', '',
'), maxRes=1)
if creator:
creator = creator[0]
if creator.find('tn15more'): creator = '%s>' % creator
creator = self._getPersons(creator)
if creator: d['creator'] = creator
writers = _findBetween(cont, 'Writer', ('', '
if writers:
writers = writers[0]
h5idx = writers.find('/h5>')
if h5idx != -1:
writers = writers[h5idx+4:]
writers = self._getPersons(writers)
if writers: d['writer'] = writers
cvurl = _getTagsWith(cont, 'name="poster"', toClosure=True, maxRes=1)
if cvurl:
cvurl = _findBetween(cvurl[0], 'src="', '"', maxRes=1)
if cvurl: d['cover url'] = cvurl[0]
genres = _findBetween(cont, 'href="/genre/', '"')
if genres:
d['genres'] = list(set(genres))
ur = _findBetween(cont, 'id="star-bar-user-rate">', '',
if ur:
rat = _findBetween(ur[0], '', '', maxRes=1)
if rat:
if rat:
d['rating'] = rat[0].strip()
self._mobile_logger.warn('wrong rating: %s', rat)
vi = ur[0].rfind('href="ratings"')
if vi != -1 and ur[0][vi+10:].find('await') == -1:
votes = _findBetween(ur[0][vi:], "title='",
" IMDb", maxRes=1)
votes = int(votes[0].replace(',', ''))
d['votes'] = votes
except (ValueError, IndexError):
self._mobile_logger.warn('wrong votes: %s', ur)
top250 = _findBetween(cont, 'href="/chart/top?', '', maxRes=1)
if top250:
fn = top250[0].rfind('#')
if fn != -1:
td = int(top250[0][fn+1:])
d['top 250 rank'] = td
except ValueError:
self._mobile_logger.warn('wrong top250: %s', top250)
castdata = _findBetween(cont, 'Cast overview', '', maxRes=1)
if not castdata:
castdata = _findBetween(cont, 'Credited cast', '', maxRes=1)
if not castdata:
castdata = _findBetween(cont, 'Complete credited cast', '',
if not castdata:
castdata = _findBetween(cont, 'Series Cast Summary', '',
if not castdata:
castdata = _findBetween(cont, 'Episode Credited cast', '',
if castdata:
castdata = castdata[0]
# Reintegrate the first tag.
fl = castdata.find('href=')
if fl != -1: castdata = '')
if smib != -1:
smie = castdata.rfind(' ')
if smie != -1:
castdata = castdata[:smib].strip() + \
castdata = castdata.replace('/tr> ', '', maxRes=1)
if akas:
# For some reason, here '):
if spouse.count('') > 1:
spouse = spouse.replace('', '::', 1)
spouse = _unHtml(spouse)
spouse = spouse.replace(':: ', '::').strip()
if spouse: sl.append(spouse)
if sl: d['spouse'] = sl
nnames = _findBetween(cont, '
is still used in place of
akas[:] = [x for x in akas[0].split('
') if x.strip()]
akas = [_unHtml(x).replace('" - ','::', 1).lstrip('"').strip()
for x in akas]
if 'See more' in akas: akas.remove('See more')
akas[:] = [x for x in akas if x]
if akas:
d['akas'] = akas
mpaa = _findBetween(cont, 'MPAA:', '', maxRes=1)
if mpaa: d['mpaa'] = _unHtml(mpaa[0])
runtimes = _findBetween(cont, 'Runtime:', '', maxRes=1)
if runtimes:
runtimes = runtimes[0]
runtimes = [x.strip().replace(' min', '').replace(' (', '::(', 1)
for x in runtimes.split('|')]
d['runtimes'] = [_unHtml(x).strip() for x in runtimes]
if kind == 'episode':
# number of episodes.
epsn = _findBetween(cont, 'title="Full Episode List">', '',
if epsn:
epsn = epsn[0].replace(' Episodes', '').strip()
if epsn:
epsn = int(epsn)
self._mobile_logger.warn('wrong episodes #: %s', epsn)
d['number of episodes'] = epsn
country = _findBetween(cont, 'Country:', '', maxRes=1)
if country:
country[:] = country[0].split(' | ')
country[:] = ['', '::')) for x in country]
if country: d['countries'] = country
lang = _findBetween(cont, 'Language:', '', maxRes=1)
if lang:
lang[:] = lang[0].split(' | ')
lang[:] = ['', '::')) for x in lang]
if lang: d['languages'] = lang
col = _findBetween(cont, '"/search/title?colors=', '')
if col:
col[:] = col[0].split(' | ')
col[:] = ['', '::')) for x in col]
if col: d['color info'] = col
sm = _findBetween(cont, '/search/title?sound_mixes=', '',
if sm:
sm[:] = sm[0].split(' | ')
sm[:] = ['', '::')) for x in sm]
if sm: d['sound mix'] = sm
cert = _findBetween(cont, 'Certification:', '', maxRes=1)
if cert:
cert[:] = cert[0].split(' | ')
cert[:] = [_unHtml(x.replace(' ', '::')) for x in cert]
if cert: d['certificates'] = cert
plotoutline = _findBetween(cont, 'Plot:', [''],
if plotoutline:
plotoutline = plotoutline[0].strip()
plotoutline = plotoutline.rstrip('|').rstrip()
if plotoutline: d['plot outline'] = _unHtml(plotoutline)
aratio = _findBetween(cont, 'Aspect Ratio:', [''],
if aratio:
aratio = aratio[0].strip().replace(' (', '::(', 1)
if aratio:
d['aspect ratio'] = _unHtml(aratio)
return {'data': d}
def get_movie_plot(self, movieID):
cont = self._mretrieve(self.urls['movie_main'] % movieID + 'plotsummary')
plot = _findBetween(cont, '
birth name']:
sepIdx = li.find(sep)
if sepIdx != -1:
li = li[:sepIdx]
pid = re_imdbID.findall(li)
pname = _unHtml(li)
if not (pid and pname):
self._mobile_logger.debug('no name/personID parsing' \
' %s searching for name %s', li,
resd = analyze_name(pname, canonical=1)
if akas:
resd['akas'] = akas
res.append((str(pid[0]), resd))
return res
def get_person_main(self, personID, _parseChr=False):
if not _parseChr:
url = self.urls['person_main'] % personID + 'maindetails'
url = self.urls['character_main'] % personID
s = self._mretrieve(url)
r = {}
name = _findBetween(s, '
'), maxRes=1)
if date:
date = _unHtml(date[0])
if date:
#date, notes = date_and_notes(date)
# TODO: fix to handle real names.
date_notes = date.split(' in ', 1)
notes = u''
date = date_notes[0]
if len(date_notes) == 2:
notes = date_notes[1]
dtitle = 'birth'
if dKind == 'Died':
dtitle = 'death'
if date:
r['%s date' % dtitle] = date
if notes:
r['%s notes' % dtitle] = notes
akas = _findBetween(s, 'Alternate Names:', ('
'), maxRes=1)
if akas:
akas = akas[0]
if akas:
akas = _unHtml(akas)
if akas.find(' | ') != -1:
akas = akas.split(' | ')
akas = akas.split(' / ')
if akas: r['akas'] = filter(None, [x.strip() for x in akas])
hs = _findBetween(s, "rel='image_src'", '>', maxRes=1)
if not hs:
hs = _findBetween(s, 'rel="image_src"', '>', maxRes=1)
if not hs:
hs = _findBetween(s, '', maxRes=1)
if hs:
hsl = _findBetween(hs[0], "href='", "'", maxRes=1)
if not hsl:
hsl = _findBetween(hs[0], 'href="', '"', maxRes=1)
if hsl and 'imdb-share-logo' not in hsl[0]:
r['headshot'] = hsl[0]
# Build a list of tuples such as [('hrefLink', 'section name')]
workkind = _findBetween(s, 'id="jumpto_', '')
ws = []
for work in workkind:
sep = '" >'
if '">' in work:
sep = '">'
wsplit = work.split(sep, 1)
if len(wsplit) == 2:
sect = wsplit[0]
if '"' in sect:
sect = sect[:sect.find('"')]
ws.append((sect, wsplit[1].lower()))
# XXX: I think "guest appearances" are gone.
if s.find(' tag.
if _parseChr and sect == 'filmography':
inisect = s.find('
', ' .... ', 1)
if not _parseChr:
chrIndx = m.find(' .... ')
chrIndx = m.find(' Played by ')
chids = []
if chrIndx != -1:
chrtxt = m[chrIndx+6:]
if _parseChr:
chrtxt = chrtxt[5:]
for ch in chrtxt.split(' / '):
chid = re_imdbID.findall(ch)
if not chid:
if not chids:
chids = None
elif len(chids) == 1:
chids = chids[0]
movieID = str(movieID[0])
# Search the status.
stidx = m.find('')
status = u''
if stidx != -1:
stendidx = m.rfind('')
if stendidx != -1:
status = _unHtml(m[stidx+3:stendidx])
m = m.replace(m[stidx+3:stendidx], '')
year = _findBetween(m, 'year_column">', '', maxRes=1)
if year:
year = year[0]
m = m.replace('%s' % year,
year = None
m = _unHtml(m)
if not m:
self._mobile_logger.warn('no title for movieID %s', movieID)
movie = build_movie(m, movieID=movieID, status=status,
roleID=chids, modFunct=self._defModFunct,
_parsingCharacter=_parseChr, year=year)
sectName = sectName.split(':')[0]
r.setdefault(sectName, []).append(movie)
# If available, take the always correct name from a form.
itag = _getTagsWith(s, 'NAME="primary"', maxRes=1)
if not itag:
itag = _getTagsWith(s, 'name="primary"', maxRes=1)
if itag:
vtag = _findBetween(itag[0], 'VALUE="', ('"', '>'), maxRes=1)
if not vtag:
vtag = _findBetween(itag[0], 'value="', ('"', '>'), maxRes=1)
if vtag:
vtag = unquote(str(vtag[0]))
vtag = unicode(vtag, 'latin_1')
except UnicodeEncodeError:
return {'data': r, 'info sets': ('main', 'filmography')}
def get_person_biography(self, personID):
cont = self._mretrieve(self.urls['person_main'] % personID + 'bio')
d = {}
spouses = _findBetween(cont, 'Spouse', ('', ''),
if spouses:
sl = []
for spouse in spouses[0].split('Nickname
', ('
if nnames:
nnames = nnames[0]
if nnames:
nnames = [x.strip().replace(' (', '::(', 1)
for x in nnames.split('
if nnames:
d['nick names'] = nnames
misc_sects = _findBetween(cont, '', '
') for x in misc_sects]
misc_sects[:] = [x for x in misc_sects if len(x) == 2]
for sect, data in misc_sects:
sect = sect.lower().replace(':', '').strip()
if d.has_key(sect) and sect != 'mini biography': continue
elif sect in ('spouse', 'nickname'): continue
if sect == 'salary': sect = 'salary history'
elif sect == 'where are they now': sect = 'where now'
elif sect == 'personal quotes': sect = 'quotes'
data = data.replace('
misc_sects[:] = [x.split('
', '::')
data = data.replace('
', ' ') # for multi-paragraphs 'bio'
data = data.replace('