SickRage/lib/guessit/matchtree.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals
from guessit import UnicodeMixin, base_text_type, Guess
from guessit.textutils import clean_string, str_fill
from guessit.patterns import group_delimiters
from guessit.guess import (merge_similar_guesses, merge_all,
                           choose_int, choose_string)
import copy
import logging

log = logging.getLogger(__name__)


class BaseMatchTree(UnicodeMixin):
    """A MatchTree represents the hierarchical split of a string into its
    constituent semantic groups."""

    def __init__(self, string='', span=None, parent=None):
        self.string = string
        self.span = span or (0, len(string))
        self.parent = parent
        self.children = []
        self.guess = Guess()

    @property
    def value(self):
        return self.string[self.span[0]:self.span[1]]

    @property
    def clean_value(self):
        return clean_string(self.value)

    @property
    def offset(self):
        return self.span[0]

    @property
    def info(self):
        result = dict(self.guess)

        for c in self.children:
            result.update(c.info)

        return result

    @property
    def root(self):
        if not self.parent:
            return self

        return self.parent.root

    @property
    def depth(self):
        if self.is_leaf():
            return 0

        return 1 + max(c.depth for c in self.children)

    def is_leaf(self):
        return self.children == []

    def add_child(self, span):
        child = MatchTree(self.string, span=span, parent=self)
        self.children.append(child)

    def partition(self, indices):
        indices = sorted(indices)
        if indices[0] != 0:
            indices.insert(0, 0)
        if indices[-1] != len(self.value):
            indices.append(len(self.value))

        for start, end in zip(indices[:-1], indices[1:]):
            self.add_child(span=(self.offset + start,
                                 self.offset + end))

    def split_on_components(self, components):
        offset = 0
        for c in components:
            start = self.value.find(c, offset)
            end = start + len(c)
            self.add_child(span=(self.offset + start,
                                 self.offset + end))
            offset = end

    def nodes_at_depth(self, depth):
        if depth == 0:
            yield self

        for child in self.children:
            for node in child.nodes_at_depth(depth - 1):
                yield node

    @property
    def node_idx(self):
        if self.parent is None:
            return ()
        return self.parent.node_idx + (self.parent.children.index(self),)

    def node_at(self, idx):
        if not idx:
            return self

        try:
            return self.children[idx[0]].node_at(idx[1:])
        except:
            raise ValueError('Non-existent node index: %s' % (idx,))

    def nodes(self):
        yield self
        for child in self.children:
            for node in child.nodes():
                yield node

    def _leaves(self):
        if self.is_leaf():
            yield self
        else:
            for child in self.children:
                # pylint: disable=W0212
                for leaf in child._leaves():
                    yield leaf

    def leaves(self):
        return list(self._leaves())

    def to_string(self):
        empty_line = ' ' * len(self.string)

        def to_hex(x):
            if isinstance(x, int):
                return str(x) if x < 10 else chr(55 + x)
            return x

        def meaning(result):
            mmap = { 'episodeNumber': 'E',
                     'season': 'S',
                     'extension': 'e',
                     'format': 'f',
                     'language': 'l',
                     'country': 'C',
                     'videoCodec': 'v',
                     'audioCodec': 'a',
                     'website': 'w',
                     'container': 'c',
                     'series': 'T',
                     'title': 't',
                     'date': 'd',
                     'year': 'y',
                     'releaseGroup': 'r',
                     'screenSize': 's'
                     }

            if result is None:
                return ' '

            for prop, l in mmap.items():
                if prop in result:
                    return l

            return 'x'

        lines = [ empty_line ] * (self.depth + 2) # +2: remaining, meaning
        lines[-2] = self.string

        for node in self.nodes():
            if node == self:
                continue

            idx = node.node_idx
            depth = len(idx) - 1
            if idx:
                lines[depth] = str_fill(lines[depth], node.span,
                                        to_hex(idx[-1]))
            if node.guess:
                lines[-2] = str_fill(lines[-2], node.span, '_')
                lines[-1] = str_fill(lines[-1], node.span, meaning(node.guess))

        lines.append(self.string)

        return '\n'.join(lines)

    def __unicode__(self):
        return self.to_string()


class MatchTree(BaseMatchTree):
    """The MatchTree contains a few "utility" methods which are not necessary
    for the BaseMatchTree, but add a lot of convenience for writing
    higher-level rules."""

    def _unidentified_leaves(self,
                             valid=lambda leaf: len(leaf.clean_value) >= 2):
        for leaf in self._leaves():
            if not leaf.guess and valid(leaf):
                yield leaf

    def unidentified_leaves(self,
                            valid=lambda leaf: len(leaf.clean_value) >= 2):
        return list(self._unidentified_leaves(valid))

    def _leaves_containing(self, property_name):
        if isinstance(property_name, base_text_type):
            property_name = [ property_name ]

        for leaf in self._leaves():
            for prop in property_name:
                if prop in leaf.guess:
                    yield leaf
                    break

    def leaves_containing(self, property_name):
        return list(self._leaves_containing(property_name))

    def first_leaf_containing(self, property_name):
        try:
            return next(self._leaves_containing(property_name))
        except StopIteration:
            return None

    def _previous_unidentified_leaves(self, node):
        node_idx = node.node_idx
        for leaf in self._unidentified_leaves():
            if leaf.node_idx < node_idx:
                yield leaf

    def previous_unidentified_leaves(self, node):
        return list(self._previous_unidentified_leaves(node))

    def _previous_leaves_containing(self, node, property_name):
        node_idx = node.node_idx
        for leaf in self._leaves_containing(property_name):
            if leaf.node_idx < node_idx:
                yield leaf

    def previous_leaves_containing(self, node, property_name):
        return list(self._previous_leaves_containing(node, property_name))

    def is_explicit(self):
        """Return whether the group was explicitly enclosed by
        parentheses/square brackets/etc."""
        return (self.value[0] + self.value[-1]) in group_delimiters

    def matched(self):
        # we need to make a copy here, as the merge functions work in place and
        # calling them on the match tree would modify it
        parts = [node.guess for node in self.nodes() if node.guess]
        parts = copy.deepcopy(parts)

        # 1- try to merge similar information together and give it a higher
        #    confidence
        for int_part in ('year', 'season', 'episodeNumber'):
            merge_similar_guesses(parts, int_part, choose_int)

        for string_part in ('title', 'series', 'container', 'format',
                            'releaseGroup', 'website', 'audioCodec',
                            'videoCodec', 'screenSize', 'episodeFormat',
                            'audioChannels', 'idNumber'):
            merge_similar_guesses(parts, string_part, choose_string)

        # 2- merge the rest, potentially discarding information not properly
        #    merged before
        result = merge_all(parts,
                           append=['language', 'subtitleLanguage', 'other'])

        log.debug('Final result: ' + result.nice_string())
        return result