SickRage/lib/guessit/__init__.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals

__version__ = '0.7.dev0'
__all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info']


# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys
if sys.version_info[0] >= 3:
    PY3 = True
    unicode_text_type = str
    native_text_type = str
    base_text_type = str
    def u(x):
        return str(x)
    def s(x):
        return x
    class UnicodeMixin(object):
        __str__ = lambda x: x.__unicode__()
    import binascii
    def to_hex(x):
        return binascii.hexlify(x).decode('utf-8')

else:
    PY3 = False
    __all__ = [ str(s) for s in __all__ ] # fix imports for python2
    unicode_text_type = unicode
    native_text_type = str
    base_text_type = basestring
    def u(x):
        if isinstance(x, str):
            return x.decode('utf-8')
        return unicode(x)
    def s(x):
        if isinstance(x, unicode):
            return x.encode('utf-8')
        if isinstance(x, list):
            return [ s(y) for y in x ]
        if isinstance(x, tuple):
            return tuple(s(y) for y in x)
        if isinstance(x, dict):
            return dict((s(key), s(value)) for key, value in x.items())
        return x
    class UnicodeMixin(object):
        __str__ = lambda x: unicode(x).encode('utf-8')
    def to_hex(x):
        return x.encode('hex')


from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging

log = logging.getLogger(__name__)


class NullHandler(logging.Handler):
    def emit(self, record):
        pass

# let's be a nicely behaving library
h = NullHandler()
log.addHandler(h)


def _guess_filename(filename, filetype):
    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
        return m

    mtree = IterativeMatcher(filename, filetype=filetype)

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=['skip_first_year'])


    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()


    if m.get('title') is None:
        return m

    if m.get('title') != m2.get('title'):
        title = next(find_nodes(mtree.match_tree, 'title'))
        title2 = next(find_nodes(mtree2.match_tree, 'title'))

        langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))
        if not langs:
            return warning('A weird error happened with language detection')

        # find the language that is likely more relevant
        for lng in langs:
            if lng.value in title2.value:
                # if the language was detected as part of a potential title,
                # look at this one in particular
                lang = lng
                break
        else:
            # pick the first one if we don't have a better choice
            lang = langs[0]


        # language code are rarely part of a title, and those
        # should be handled by the Language exceptions anyway
        if len(lang.value) <= 3:
            return m


        # if filetype is subtitle and the language appears last, just before
        # the extension, then it is likely a subtitle language
        parts = clean_string(title.root.value).split()
        if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
            parts.index(lang.value) == len(parts) - 2):
            return m

        # if the language was in the middle of the other potential title,
        # keep the other title (eg: The Italian Job), except if it is at the
        # very beginning, in which case we consider it an error
        if m2['title'].startswith(lang.value):
            return m
        elif lang.value in title2.value:
            return m2

        # if a node is in an explicit group, then the correct title is probably
        # the other one
        if title.root.node_at(title.node_idx[:2]).is_explicit():
            return m2
        elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
            return m

        return warning('Not sure of the title because of the language position')


    return m


def guess_file_info(filename, filetype, info=None):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}
    """
    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if info is None:
        info = ['filename']

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, filetype))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({'hash_mpc': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s' % e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({'hash_ed2k': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s' % e)

        elif infotype.startswith('hash_'):
            import hashlib
            hashname = infotype[5:]
            try:
                hasher = getattr(hashlib, hashname)()
                hashers.append((infotype, hasher))
            except AttributeError:
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)

        else:
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            hasherobjs = dict(hashers).values()

            with open(filename, 'rb') as f:
                chunk = f.read(blocksize)
                while chunk:
                    for hasher in hasherobjs:
                        hasher.update(chunk)
                    chunk = f.read(blocksize)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            log.warning('Could not compute hash because: %s' % e)

    result = merge_all(result)

    # last minute adjustments

    # if country is in the guessed properties, make it part of the filename
    if 'series' in result and 'country' in result:
        result['series'] += ' (%s)' % result['country'].alpha2.upper()


    return result


def guess_video_info(filename, info=None):
    return guess_file_info(filename, 'autodetect', info)


def guess_movie_info(filename, info=None):
    return guess_file_info(filename, 'movie', info)


def guess_episode_info(filename, info=None):
    return guess_file_info(filename, 'episode', info)
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 01:18:05 -04:00			`#!/usr/bin/env python2`
			`# -- coding: utf-8 --`
			`#`
			`# GuessIt - A library for guessing information from filenames`
			`# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>`
			`#`
			`# GuessIt is free software; you can redistribute it and/or modify it under`
			`# the terms of the Lesser GNU General Public License as published by`
			`# the Free Software Foundation; either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# GuessIt is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# Lesser GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the Lesser GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`

			`from __future__ import unicode_literals`

			`__version__ = '0.7.dev0'`
			`__all__ = ['Guess', 'Language',`
			`'guess_file_info', 'guess_video_info',`
			`'guess_movie_info', 'guess_episode_info']`


			`# Do python3 detection before importing any other module, to be sure that`
			`# it will then always be available`
			`# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/`
			`import sys`
			`if sys.version_info[0] >= 3:`
			`PY3 = True`
			`unicode_text_type = str`
			`native_text_type = str`
			`base_text_type = str`
			`def u(x):`
			`return str(x)`
			`def s(x):`
			`return x`
			`class UnicodeMixin(object):`
			`__str__ = lambda x: x.__unicode__()`
			`import binascii`
			`def to_hex(x):`
			`return binascii.hexlify(x).decode('utf-8')`

			`else:`
			`PY3 = False`
			`__all__ = [ str(s) for s in __all__ ] # fix imports for python2`
			`unicode_text_type = unicode`
			`native_text_type = str`
			`base_text_type = basestring`
			`def u(x):`
			`if isinstance(x, str):`
			`return x.decode('utf-8')`
			`return unicode(x)`
			`def s(x):`
			`if isinstance(x, unicode):`
			`return x.encode('utf-8')`
			`if isinstance(x, list):`
			`return [ s(y) for y in x ]`
			`if isinstance(x, tuple):`
			`return tuple(s(y) for y in x)`
			`if isinstance(x, dict):`
			`return dict((s(key), s(value)) for key, value in x.items())`
			`return x`
			`class UnicodeMixin(object):`
			`__str__ = lambda x: unicode(x).encode('utf-8')`
			`def to_hex(x):`
			`return x.encode('hex')`


			`from guessit.guess import Guess, merge_all`
			`from guessit.language import Language`
			`from guessit.matcher import IterativeMatcher`
			`from guessit.textutils import clean_string`
			`import logging`

			`log = logging.getLogger(__name__)`



			`class NullHandler(logging.Handler):`
			`def emit(self, record):`
			`pass`

			`# let's be a nicely behaving library`
			`h = NullHandler()`
			`log.addHandler(h)`


			`def _guess_filename(filename, filetype):`
			`def find_nodes(tree, props):`
			`"""Yields all nodes containing any of the given props."""`
			`if isinstance(props, base_text_type):`
			`props = [props]`
			`for node in tree.nodes():`
			`if any(prop in node.guess for prop in props):`
			`yield node`

			`def warning(title):`
			`log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))`
			`return m`

			`mtree = IterativeMatcher(filename, filetype=filetype)`

			`# if there are multiple possible years found, we assume the first one is`
			`# part of the title, reparse the tree taking this into account`
			`years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))`
			`if len(years) >= 2:`
			`mtree = IterativeMatcher(filename, filetype=filetype,`
			`opts=['skip_first_year'])`


			`m = mtree.matched()`

			`if 'language' not in m and 'subtitleLanguage' not in m:`
			`return m`

			`# if we found some language, make sure we didn't cut a title or sth...`
			`mtree2 = IterativeMatcher(filename, filetype=filetype,`
			`opts=['nolanguage', 'nocountry'])`
			`m2 = mtree2.matched()`


			`if m.get('title') is None:`
			`return m`

			`if m.get('title') != m2.get('title'):`
			`title = next(find_nodes(mtree.match_tree, 'title'))`
			`title2 = next(find_nodes(mtree2.match_tree, 'title'))`

			`langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))`
			`if not langs:`
			`return warning('A weird error happened with language detection')`

			`# find the language that is likely more relevant`
			`for lng in langs:`
			`if lng.value in title2.value:`
			`# if the language was detected as part of a potential title,`
			`# look at this one in particular`
			`lang = lng`
			`break`
			`else:`
			`# pick the first one if we don't have a better choice`
			`lang = langs[0]`


			`# language code are rarely part of a title, and those`
			`# should be handled by the Language exceptions anyway`
			`if len(lang.value) <= 3:`
			`return m`


			`# if filetype is subtitle and the language appears last, just before`
			`# the extension, then it is likely a subtitle language`
			`parts = clean_string(title.root.value).split()`
			`if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and`
			`parts.index(lang.value) == len(parts) - 2):`
			`return m`

			`# if the language was in the middle of the other potential title,`
			`# keep the other title (eg: The Italian Job), except if it is at the`
			`# very beginning, in which case we consider it an error`
			`if m2['title'].startswith(lang.value):`
			`return m`
			`elif lang.value in title2.value:`
			`return m2`

			`# if a node is in an explicit group, then the correct title is probably`
			`# the other one`
			`if title.root.node_at(title.node_idx[:2]).is_explicit():`
			`return m2`
			`elif title2.root.node_at(title2.node_idx[:2]).is_explicit():`
			`return m`

			`return warning('Not sure of the title because of the language position')`


			`return m`


			`def guess_file_info(filename, filetype, info=None):`
			`"""info can contain the names of the various plugins, such as 'filename' to`
			`detect filename info, or 'hash_md5' to get the md5 hash of the file.`

			`>>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])`
			`{'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}`
			`"""`
			`result = []`
			`hashers = []`

			`# Force unicode as soon as possible`
			`filename = u(filename)`

			`if info is None:`
			`info = ['filename']`

			`if isinstance(info, base_text_type):`
			`info = [info]`

			`for infotype in info:`
			`if infotype == 'filename':`
			`result.append(_guess_filename(filename, filetype))`

			`elif infotype == 'hash_mpc':`
			`from guessit.hash_mpc import hash_file`
			`try:`
			`result.append(Guess({'hash_mpc': hash_file(filename)},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute MPC-style hash because: %s' % e)`

			`elif infotype == 'hash_ed2k':`
			`from guessit.hash_ed2k import hash_file`
			`try:`
			`result.append(Guess({'hash_ed2k': hash_file(filename)},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute ed2k hash because: %s' % e)`

			`elif infotype.startswith('hash_'):`
			`import hashlib`
			`hashname = infotype[5:]`
			`try:`
			`hasher = getattr(hashlib, hashname)()`
			`hashers.append((infotype, hasher))`
			`except AttributeError:`
			`log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)`

			`else:`
			`log.warning('Invalid infotype: %s' % infotype)`

			`# do all the hashes now, but on a single pass`
			`if hashers:`
			`try:`
			`blocksize = 8192`
			`hasherobjs = dict(hashers).values()`

			`with open(filename, 'rb') as f:`
			`chunk = f.read(blocksize)`
			`while chunk:`
			`for hasher in hasherobjs:`
			`hasher.update(chunk)`
			`chunk = f.read(blocksize)`

			`for infotype, hasher in hashers:`
			`result.append(Guess({infotype: hasher.hexdigest()},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute hash because: %s' % e)`

			`result = merge_all(result)`

			`# last minute adjustments`

			`# if country is in the guessed properties, make it part of the filename`
			`if 'series' in result and 'country' in result:`
			`result['series'] += ' (%s)' % result['country'].alpha2.upper()`


			`return result`


			`def guess_video_info(filename, info=None):`
			`return guess_file_info(filename, 'autodetect', info)`


			`def guess_movie_info(filename, info=None):`
			`return guess_file_info(filename, 'movie', info)`


			`def guess_episode_info(filename, info=None):`
			`return guess_file_info(filename, 'episode', info)`