mirror of
https://github.com/moparisthebest/SickRage
synced 2024-11-06 01:15:05 -05:00
92 lines
3.4 KiB
Python
92 lines
3.4 KiB
Python
|
"""
|
||
|
parser.http.companyParser module (imdb package).
|
||
|
|
||
|
This module provides the classes (and the instances), used to parse
|
||
|
the IMDb pages on the akas.imdb.com server about a company.
|
||
|
E.g., for "Columbia Pictures [us]" the referred page would be:
|
||
|
main details: http://akas.imdb.com/company/co0071509/
|
||
|
|
||
|
Copyright 2008-2009 Davide Alberani <da@erlug.linux.it>
|
||
|
2008 H. Turgut Uyar <uyar@tekir.org>
|
||
|
|
||
|
This program is free software; you can redistribute it and/or modify
|
||
|
it under the terms of the GNU General Public License as published by
|
||
|
the Free Software Foundation; either version 2 of the License, or
|
||
|
(at your option) any later version.
|
||
|
|
||
|
This program is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU General Public License
|
||
|
along with this program; if not, write to the Free Software
|
||
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
"""
|
||
|
|
||
|
import re
|
||
|
from utils import build_movie, Attribute, Extractor, DOMParserBase, \
|
||
|
analyze_imdbid
|
||
|
|
||
|
from imdb.utils import analyze_company_name
|
||
|
|
||
|
|
||
|
class DOMCompanyParser(DOMParserBase):
|
||
|
"""Parser for the main page of a given company.
|
||
|
The page should be provided as a string, as taken from
|
||
|
the akas.imdb.com server. The final result will be a
|
||
|
dictionary, with a key for every relevant section.
|
||
|
|
||
|
Example:
|
||
|
cparser = DOMCompanyParser()
|
||
|
result = cparser.parse(company_html_string)
|
||
|
"""
|
||
|
_containsObjects = True
|
||
|
|
||
|
extractors = [
|
||
|
Extractor(label='name',
|
||
|
path="//title",
|
||
|
attrs=Attribute(key='name',
|
||
|
path="./text()",
|
||
|
postprocess=lambda x: \
|
||
|
analyze_company_name(x, stripNotes=True))),
|
||
|
|
||
|
Extractor(label='filmography',
|
||
|
group="//b/a[@name]",
|
||
|
group_key="./text()",
|
||
|
group_key_normalize=lambda x: x.lower(),
|
||
|
path="../following-sibling::ol[1]/li",
|
||
|
attrs=Attribute(key=None,
|
||
|
multi=True,
|
||
|
path={
|
||
|
'link': "./a[1]/@href",
|
||
|
'title': "./a[1]/text()",
|
||
|
'year': "./text()[1]"
|
||
|
},
|
||
|
postprocess=lambda x:
|
||
|
build_movie(u'%s %s' % \
|
||
|
(x.get('title'), x.get('year').strip()),
|
||
|
movieID=analyze_imdbid(x.get('link') or u''),
|
||
|
_parsingCompany=True))),
|
||
|
]
|
||
|
|
||
|
preprocessors = [
|
||
|
(re.compile('(<b><a name=)', re.I), r'</p>\1')
|
||
|
]
|
||
|
|
||
|
def postprocess_data(self, data):
|
||
|
for key in data.keys():
|
||
|
new_key = key.replace('company', 'companies')
|
||
|
new_key = new_key.replace('other', 'miscellaneous')
|
||
|
new_key = new_key.replace('distributor', 'distributors')
|
||
|
if new_key != key:
|
||
|
data[new_key] = data[key]
|
||
|
del data[key]
|
||
|
return data
|
||
|
|
||
|
|
||
|
_OBJECTS = {
|
||
|
'company_main_parser': ((DOMCompanyParser,), None)
|
||
|
}
|
||
|
|