SickRage/lib/imdb/parser/http/characterParser.py

"""
parser.http.characterParser module (imdb package).

This module provides the classes (and the instances), used to parse
the IMDb pages on the akas.imdb.com server about a character.
E.g., for "Jesse James" the referred pages would be:
    main details:   http://www.imdb.com/character/ch0000001/
    biography:      http://www.imdb.com/character/ch0000001/bio
    ...and so on...

Copyright 2007-2009 Davide Alberani <da@erlug.linux.it>
               2008 H. Turgut Uyar <uyar@tekir.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import re
from utils import Attribute, Extractor, DOMParserBase, build_movie, \
                    analyze_imdbid
from personParser import DOMHTMLMaindetailsParser

from imdb.Movie import Movie

_personIDs = re.compile(r'/name/nm([0-9]{7})')
class DOMHTMLCharacterMaindetailsParser(DOMHTMLMaindetailsParser):
    """Parser for the "filmography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterMaindetailsParser()
        result = bparser.parse(character_biography_html_string)
    """
    _containsObjects = True

    _film_attrs = [Attribute(key=None,
                      multi=True,
                      path={
                          'link': "./a[1]/@href",
                          'title': ".//text()",
                          'status': "./i/a//text()",
                          'roleID': "./a/@href"
                          },
                      postprocess=lambda x:
                          build_movie(x.get('title') or u'',
                              movieID=analyze_imdbid(x.get('link') or u''),
                              roleID=_personIDs.findall(x.get('roleID') or u''),
                              status=x.get('status') or None,
                              _parsingCharacter=True))]

    extractors = [
            Extractor(label='title',
                        path="//title",
                        attrs=Attribute(key='name',
                            path="./text()",
                            postprocess=lambda x: \
                                    x.replace(' (Character)', '').replace(
                                        '- Filmography by type', '').strip())),

            Extractor(label='headshot',
                        path="//a[@name='headshot']",
                        attrs=Attribute(key='headshot',
                            path="./img/@src")),

            Extractor(label='akas',
                        path="//div[h5='Alternate Names:']",
                        attrs=Attribute(key='akas',
                            path="./div//text()",
                            postprocess=lambda x: x.strip().split(' / '))),

            Extractor(label='filmography',
                        path="//div[@class='filmo'][not(h5)]/ol/li",
                        attrs=_film_attrs),

            Extractor(label='filmography sections',
                        group="//div[@class='filmo'][h5]",
                        group_key="./h5/a/text()",
                        group_key_normalize=lambda x: x.lower()[:-1],
                        path="./ol/li",
                        attrs=_film_attrs),
            ]

    preprocessors = [
            # Check that this doesn't cut "status"...
            (re.compile(r'<br>(\.\.\.|   ).+?</li>', re.I | re.M), '</li>')]


class DOMHTMLCharacterBioParser(DOMParserBase):
    """Parser for the "biography" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        bparser = DOMHTMLCharacterBioParser()
        result = bparser.parse(character_biography_html_string)
    """
    _defGetRefs = True

    extractors = [
            Extractor(label='introduction',
                        path="//div[@id='_intro']",
                        attrs=Attribute(key='introduction',
                            path=".//text()",
                            postprocess=lambda x: x.strip())),

            Extractor(label='biography',
                        path="//span[@class='_biography']",
                        attrs=Attribute(key='biography',
                            multi=True,
                            path={
                                'info': "./preceding-sibling::h4[1]//text()",
                                'text': ".//text()"
                            },
                            postprocess=lambda x: u'%s: %s' % (
                                x.get('info').strip(),
                                x.get('text').replace('\n',
                                    ' ').replace('||', '\n\n').strip()))),
    ]

    preprocessors = [
        (re.compile('(<div id="swiki.2.3.1">)', re.I), r'\1<div id="_intro">'),
        (re.compile('(<a name="history">)\s*(<table .*?</table>)',
                    re.I | re.DOTALL),
         r'</div>\2\1</a>'),
        (re.compile('(<a name="[^"]+">)(<h4>)', re.I), r'</span>\1</a>\2'),
        (re.compile('(</h4>)</a>', re.I), r'\1<span class="_biography">'),
        (re.compile('<br/><br/>', re.I), r'||'),
        (re.compile('\|\|\n', re.I), r'</span>'),
        ]


class DOMHTMLCharacterQuotesParser(DOMParserBase):
    """Parser for the "quotes" page of a given character.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        qparser = DOMHTMLCharacterQuotesParser()
        result = qparser.parse(character_quotes_html_string)
    """
    _defGetRefs = True

    extractors = [
        Extractor(label='charquotes',
                    group="//h5",
                    group_key="./a/text()",
                    path="./following-sibling::div[1]",
                    attrs=Attribute(key=None,
                        path={'txt': ".//text()",
                              'movieID': ".//a[1]/@href"},
                        postprocess=lambda x: (analyze_imdbid(x['movieID']),
                                    x['txt'].strip().replace(':   ',
                                    ': ').replace(':  ', ': ').split('||'))))
    ]

    preprocessors = [
        (re.compile('(</h5>)', re.I), r'\1<div>'),
        (re.compile('\s*<br/><br/>\s*', re.I), r'||'),
        (re.compile('\|\|\s*(<hr/>)', re.I), r'</div>\1'),
        (re.compile('\s*<br/>\s*', re.I), r'::')
        ]

    def postprocess_data(self, data):
        if not data:
            return {}
        newData = {}
        for title in data:
            movieID, quotes = data[title]
            if movieID is None:
                movie = title
            else:
                movie = Movie(title=title, movieID=movieID,
                              accessSystem=self._as, modFunct=self._modFunct)
            newData[movie] = [quote.split('::') for quote in quotes]
        return {'quotes': newData}


from personParser import DOMHTMLSeriesParser

_OBJECTS = {
    'character_main_parser': ((DOMHTMLCharacterMaindetailsParser,),
                                {'kind': 'character'}),
    'character_series_parser': ((DOMHTMLSeriesParser,), None),
    'character_bio_parser': ((DOMHTMLCharacterBioParser,), None),
    'character_quotes_parser': ((DOMHTMLCharacterQuotesParser,), None)
}