# -*- coding: utf-8 -*-
import os
import sys
import codecs

try:
    from collections import UserList
except ImportError:
    from UserList import UserList

from itertools import chain
from copy import copy

from pysrt.srtexc import Error
from pysrt.srtitem import SubRipItem
from pysrt.compat import str

# (BOM bytes, codec name) pairs. The UTF-32 entries come first because the
# UTF-32 LE BOM starts with the UTF-16 LE BOM: `_detect_encoding` returns the
# first match, so the longer marks have to be tested before the shorter ones.
BOMS = ((codecs.BOM_UTF32_LE, 'utf_32_le'),
        (codecs.BOM_UTF32_BE, 'utf_32_be'),
        (codecs.BOM_UTF16_LE, 'utf_16_le'),
        (codecs.BOM_UTF16_BE, 'utf_16_be'),
        (codecs.BOM_UTF8, 'utf_8'))
# codec name -> BOM decoded with that codec (i.e. the BOM as a text string).
CODECS_BOMS = dict((codec, str(bom, codec)) for bom, codec in BOMS)
# Length in bytes of the longest known BOM.
BIGGER_BOM = max(len(bom) for bom, encoding in BOMS)


class SubRipFile(UserList, object):
    """
    SubRip file descriptor.

    Provide a pure Python mapping on all metadata.

    SubRipFile(items, eol, path, encoding)

    items -> list of SubRipItem. Default to [].
    eol -> str: end of line character. Default to linesep used in opened file
        if any else to os.linesep.
    path -> str: path where file will be saved. To open an existing file see
        SubRipFile.open.
    encoding -> str: encoding used at file save. Default to utf-8.
    """
    ERROR_PASS = 0
    ERROR_LOG = 1
    ERROR_RAISE = 2

    DEFAULT_ENCODING = 'utf_8'

    def __init__(self, items=None, eol=None, path=None, encoding='utf-8'):
        UserList.__init__(self, items or [])
        self._eol = eol
        self.path = path
        self.encoding = encoding

    def _get_eol(self):
        """Return the explicit eol if one is set, else `os.linesep`."""
        return self._eol or os.linesep

    def _set_eol(self, eol):
        """Set the eol only if none has been set yet (first value wins)."""
        self._eol = self._eol or eol

    eol = property(_get_eol, _set_eol)

    def slice(self, starts_before=None, starts_after=None, ends_before=None,
              ends_after=None):
        """
        slice([starts_before][, starts_after][, ends_before][, ends_after])
        -> SubRipFile clone

        All arguments are optional, and should be coercible to SubRipTime
        object.

        It reduces the set of subtitles to those that match the given time
        constraints. A constraint whose value is falsy (e.g. None) is
        ignored.

        The returned set is a clone, but still contains references to the
        original subtitles. So if you shift this returned set, subs contained
        in the original SubRipFile instance will be altered too.

        Example:
            >>> subs.slice(ends_after={'seconds': 20}).shift(seconds=2)
        """
        clone = copy(self)

        if starts_before:
            clone.data = [i for i in clone.data if i.start < starts_before]
        if starts_after:
            clone.data = [i for i in clone.data if i.start > starts_after]
        if ends_before:
            clone.data = [i for i in clone.data if i.end < ends_before]
        if ends_after:
            clone.data = [i for i in clone.data if i.end > ends_after]

        clone.data = list(clone.data)
        return clone

    def at(self, timestamp=None, **kwargs):
        """
        at(timestamp) -> SubRipFile clone

        timestamp argument should be coercible to SubRipTime object; it may
        also be given as keyword arguments, which are forwarded as a dict.

        A specialization of slice. Return all subtitles visible at the
        timestamp mark.

        Example:
            >>> subs.at((0, 0, 20, 0)).shift(seconds=2)
            >>> subs.at(seconds=20).shift(seconds=2)
        """
        time = timestamp or kwargs
        return self.slice(starts_before=time, ends_after=time)

    def shift(self, *args, **kwargs):
        """shift(hours, minutes, seconds, milliseconds, ratio)

        Shift `start` and `end` attributes of each item of the file either by
        applying a ratio or by adding an offset. All arguments are forwarded
        unchanged to each item's own `shift` method.

        `ratio` should be either an int or a float.
        Example to convert subtitles from 23.9 fps to 25 fps:
            >>> subs.shift(ratio=25/23.9)

        All "time" arguments are optional and have a default value of 0.
        Example to delay all subs by 2 seconds and a half:
            >>> subs.shift(seconds=2, milliseconds=500)
        """
        for item in self:
            item.shift(*args, **kwargs)

    def clean_indexes(self):
        """
        clean_indexes()

        Sort subs and reset their index attribute (numbered from 1). Should
        be called after destructive operations like split or such.
        """
        self.sort()
        for index, item in enumerate(self):
            item.index = index + 1

    @property
    def text(self):
        """Text of every item, joined with a newline character."""
        return '\n'.join(i.text for i in self)

    @classmethod
    def open(cls, path='', encoding=None, error_handling=ERROR_PASS):
        """
        open([path, [encoding]])

        If you do not provide any encoding, it is detected from the file's
        byte order mark if it has one, and defaults to utf-8 otherwise.

        The encoding actually used for reading is stored on the returned
        instance, so that a later `save()` reuses it.
        """
        # Resolve the encoding up front: storing None on the instance would
        # make a later save() open the output file without any encoding.
        encoding = encoding or cls._detect_encoding(path)
        new_file = cls(path=path, encoding=encoding)
        source_file = cls._open_unicode_file(path, claimed_encoding=encoding)
        try:
            new_file.read(source_file, error_handling=error_handling)
        finally:
            # Do not leak the handle when parsing raises (ERROR_RAISE).
            source_file.close()
        return new_file

    @classmethod
    def from_string(cls, source, **kwargs):
        """
        from_string(source, **kwargs) -> SubRipFile

        `source` -> a unicode instance or at least a str instance encoded with
        `sys.getdefaultencoding()`

        `error_handling` may be given as a keyword argument; every other
        keyword argument is passed to the constructor.
        """
        error_handling = kwargs.pop('error_handling', cls.ERROR_PASS)
        new_file = cls(**kwargs)
        new_file.read(source.splitlines(True), error_handling=error_handling)
        return new_file

    def read(self, source_file, error_handling=ERROR_PASS):
        """
        read(source_file, [error_handling])

        This method parses subtitles contained in `source_file` and appends
        them to the current instance. Returns the instance itself.

        `source_file` -> Any iterable that yields unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.
        """
        rewindable = (hasattr(source_file, 'tell') and
                      hasattr(source_file, 'seek'))
        if iter(source_file) is source_file and not rewindable:
            # One-shot iterator (e.g. a generator): peeking at the first line
            # to guess the eol would consume it for good, so pull it out here
            # and chain it back in front of the remaining lines.
            head = []
            for line in source_file:
                head.append(line)
                break
            self.eol = self._guess_eol(head)
            source_file = chain(head, source_file)
        else:
            self.eol = self._guess_eol(source_file)
        self.extend(self.stream(source_file, error_handling=error_handling))
        return self

    @classmethod
    def stream(cls, source_file, error_handling=ERROR_PASS):
        """
        stream(source_file, [error_handling])

        This method yields SubRipItem instances as soon as they have been
        parsed without storing them. It is a kind of SAX parser for .srt
        files.

        `source_file` -> Any iterable that yields unicode strings, like a file
            opened with `codecs.open()` or an array of unicode.

        Example:
            >>> import pysrt
            >>> import codecs
            >>> file = codecs.open('movie.srt', encoding='utf-8')
            >>> for sub in pysrt.stream(file):
            ...     sub.text += "\\nHello !"
            ...     print unicode(sub)
        """
        string_buffer = []
        # The extra '\n' acts as a final blank line so that the last item is
        # flushed even when the source does not end with an empty line.
        for index, line in enumerate(chain(source_file, '\n')):
            if line.strip():
                string_buffer.append(line)
            else:
                # Blank line: the buffered lines form one complete item.
                source = string_buffer
                string_buffer = []
                if source and all(source):
                    try:
                        yield SubRipItem.from_lines(source)
                    except Error as error:
                        # Attach the offending text to the error for
                        # reporting by `_handle_error`.
                        error.args += (''.join(source), )
                        cls._handle_error(error, error_handling, index)

    def save(self, path=None, encoding=None, eol=None):
        """
        save([path][, encoding][, eol])

        Use initial path if no other provided.
        Use initial encoding if no other provided (utf-8 if none is known).
        Use initial eol if no other provided.
        """
        path = path or self.path
        encoding = encoding or self.encoding or self.DEFAULT_ENCODING

        save_file = codecs.open(path, 'w+', encoding=encoding)
        try:
            self.write_into(save_file, eol=eol)
        finally:
            # Close the file even if serializing an item fails.
            save_file.close()

    def write_into(self, output_file, eol=None):
        """
        write_into(output_file [, eol])

        Serialize current state into `output_file`.

        `output_file` -> Any instance that responds to `write()`, typically a
        file object
        """
        output_eol = eol or self.eol

        for item in self:
            string_repr = str(item)
            if output_eol != '\n':
                string_repr = string_repr.replace('\n', output_eol)
            output_file.write(string_repr)
            # Only add trailing eol if it's not already present.
            # It was kept in the SubRipItem's text before but it really
            # belongs here. Existing applications might give us subtitles
            # which already contain a trailing eol though.
            if not string_repr.endswith(2 * output_eol):
                output_file.write(output_eol)

    @classmethod
    def _guess_eol(cls, string_iterable):
        """Return the line ending of the first line, else `os.linesep`."""
        first_line = cls._get_first_line(string_iterable)
        # '\r\n' must be tested before '\n', which it also ends with.
        for eol in ('\r\n', '\r', '\n'):
            if first_line.endswith(eol):
                return eol
        return os.linesep

    @classmethod
    def _get_first_line(cls, string_iterable):
        """
        Return the first line of `string_iterable`, or '' if it is empty.

        When the iterable exposes both `tell` and `seek`, its position is
        restored afterwards so that the line can be read again.
        """
        previous_position = None
        if hasattr(string_iterable, 'tell'):
            previous_position = string_iterable.tell()

        try:
            first_line = next(iter(string_iterable))
        except StopIteration:
            return ''

        # Position 0 is a valid offset, hence the explicit None check.
        if previous_position is not None and hasattr(string_iterable, 'seek'):
            string_iterable.seek(previous_position)

        return first_line

    @classmethod
    def _detect_encoding(cls, path):
        """
        Return the codec matching the BOM at the start of the file at `path`,
        or `DEFAULT_ENCODING` when no known BOM is found.
        """
        with open(path, 'rb') as file_descriptor:
            first_chars = file_descriptor.read(BIGGER_BOM)

        for bom, encoding in BOMS:
            if first_chars.startswith(bom):
                return encoding

        # TODO: maybe a chardet integration
        return cls.DEFAULT_ENCODING

    @classmethod
    def _open_unicode_file(cls, path, claimed_encoding=None):
        """
        Open `path` for reading as text and skip a leading BOM if present.

        The encoding is `claimed_encoding` when given, else it is detected
        with `_detect_encoding`. Returns the opened file object; the caller
        is responsible for closing it.
        """
        encoding = claimed_encoding or cls._detect_encoding(path)
        # The 'U' flag is rejected by Python 3.11+. On Python 3 it never had
        # any effect here because codecs.open() forces binary mode, so it is
        # only kept for Python 2.
        mode = 'rU' if sys.version_info[0] < 3 else 'r'
        source_file = codecs.open(path, mode, encoding=encoding)

        try:
            # Normalize the codec name (e.g. 'utf-8', 'UTF8' -> 'utf_8') so
            # that alias spellings still find their BOM.
            codec_name = codecs.lookup(encoding).name.replace('-', '_')
            # get rid of BOM if any
            possible_bom = CODECS_BOMS.get(codec_name, None)
            if possible_bom:
                file_bom = source_file.read(len(possible_bom))
                if not file_bom == possible_bom:
                    source_file.seek(0)  # if not rewind
        except Exception:
            # Probing failed (e.g. decode error): do not leak the handle.
            source_file.close()
            raise
        return source_file

    @classmethod
    def _handle_error(cls, error, error_handling, index):
        """
        Apply the `error_handling` policy to a parsing `error`.

        ERROR_RAISE -> re-raise with `index` prepended to the error args.
        ERROR_LOG   -> write a report to stderr.
        Any other value (e.g. ERROR_PASS) -> ignore the error.
        """
        if error_handling == cls.ERROR_RAISE:
            error.args = (index, ) + error.args
            raise error
        if error_handling == cls.ERROR_LOG:
            name = type(error).__name__
            message = error.args[0] if error.args else ''
            # Make the message ASCII-safe but keep it as text: writing bytes
            # to sys.stderr raises TypeError on Python 3.
            safe_message = message.encode('ascii', 'replace').decode('ascii')
            sys.stderr.write('PySRT-%s(line %s): \n' % (name, index))
            sys.stderr.write(safe_message)
            sys.stderr.write('\n')