filebot/source/net/filebot/similarity/SeriesNameMatcher.java

432 lines
12 KiB
Java

package net.filebot.similarity;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.filebot.similarity.CommonSequenceMatcher.*;
import static net.filebot.similarity.Normalization.*;
import static net.filebot.util.StringUtilities.*;
import java.io.File;
import java.text.CollationKey;
import java.text.Collator;
import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.filebot.media.SmartSeasonEpisodeMatcher;
import net.filebot.similarity.SeasonEpisodeMatcher.SxE;
import net.filebot.util.FileUtilities;
public class SeriesNameMatcher {
// similarity metric used to compare a common-sequence match against the original name match
protected final SimilarityMetric metric;
// locates season / episode patterns within names
protected final SeasonEpisodeMatcher seasonEpisodeMatcher;
// locates date patterns within names; used as fallback when no season / episode pattern is found
protected final DateMatcher dateMatcher;
// finds word sequences shared by multiple names; every sequence is passed through normalize(String) before it is split
protected final CommonSequenceMatcher commonSequenceMatcher;
/**
 * Create a matcher that uses a {@link NameSimilarityMetric}, the lenient collator for {@link Locale#ENGLISH}, a
 * {@link SmartSeasonEpisodeMatcher} and an English {@link DateMatcher}, each configured with its DEFAULT_SANITY setting.
 *
 * @param strict
 *            passed on to the {@link SmartSeasonEpisodeMatcher} constructor
 */
public SeriesNameMatcher(boolean strict) {
    this(
        new NameSimilarityMetric(),
        getLenientCollator(Locale.ENGLISH),
        new SmartSeasonEpisodeMatcher(SeasonEpisodeMatcher.DEFAULT_SANITY, strict),
        new DateMatcher(DateMatcher.DEFAULT_SANITY, Locale.ENGLISH)
    );
}
/**
 * Create a matcher that uses the given pattern matchers together with a {@link NameSimilarityMetric} and the lenient
 * collator for {@link Locale#ENGLISH}.
 *
 * @param seasonEpisodeMatcher
 *            matcher used to locate season / episode patterns
 * @param dateMatcher
 *            matcher used to locate date patterns
 */
public SeriesNameMatcher(SeasonEpisodeMatcher seasonEpisodeMatcher, DateMatcher dateMatcher) {
    this(
        new NameSimilarityMetric(),
        getLenientCollator(Locale.ENGLISH),
        seasonEpisodeMatcher,
        dateMatcher
    );
}
/**
 * Create a matcher from explicitly given components.
 *
 * @param metric
 *            similarity metric used when comparing a name match against a folder name match
 * @param collator
 *            collator handed to the internal {@link CommonSequenceMatcher}
 * @param seasonEpisodeMatcher
 *            matcher used to locate season / episode patterns
 * @param dateMatcher
 *            matcher used to locate date patterns
 */
public SeriesNameMatcher(SimilarityMetric metric, Collator collator, SeasonEpisodeMatcher seasonEpisodeMatcher, DateMatcher dateMatcher) {
    this.seasonEpisodeMatcher = seasonEpisodeMatcher;
    this.dateMatcher = dateMatcher;
    this.metric = metric;

    // every sequence is normalized (see normalize) before the common sequence matcher splits it
    this.commonSequenceMatcher = new CommonSequenceMatcher(collator, 3, true) {

        @Override
        public CollationKey[] split(String sequence) {
            String normalized = normalize(sequence);
            return super.split(normalized);
        }
    };
}
/**
 * Match series names for the given files.
 *
 * Files are grouped by parent folder and the names of each group are matched via {@link #matchAll(String[])}. Each
 * name match is then compared with the folder name: if the first common sequence of name match and folder name has
 * a similarity greater than 0.7 to the name match, the common sequence is used instead of the name match.
 *
 * Files without a parent folder (e.g. <code>new File("name.avi")</code>) have no folder name to compare against,
 * so their name matches are used as they are.
 *
 * @param files
 *            episode files
 * @return series names that have been matched
 */
public Collection<String> matchAll(File[] files) {
    SeriesNameCollection seriesNames = new SeriesNameCollection();

    // group files by parent folder
    for (Entry<File, String[]> entry : mapNamesByFolder(files).entrySet()) {
        // File.getParentFile() returns null for paths without a parent, so the folder key may be null
        File folder = entry.getKey();
        String parent = folder == null ? null : folder.getName();
        String[] names = entry.getValue();

        for (String nameMatch : matchAll(names)) {
            if (parent == null) {
                // nothing to compare against, keep the name match
                seriesNames.add(nameMatch);
                continue;
            }

            String commonMatch = commonSequenceMatcher.matchFirstCommonSequence(nameMatch, parent);
            float similarity = commonMatch == null ? 0 : metric.getSimilarity(commonMatch, nameMatch);

            // prefer common match, but only if it's very similar to the original match
            seriesNames.add(similarity > 0.7 ? commonMatch : nameMatch);
        }
    }

    return seriesNames;
}
/**
 * Match series names for the given episode names.
 *
 * First, common word sequences are collected from the part of each name that precedes its season / episode pattern
 * (or its date pattern, if there is no season / episode prefix). These sequences are then used as known prefixes
 * for pattern based matching, and finally added to the result themselves.
 *
 * @param names
 *            episode names
 * @return series names that have been matched
 */
public Collection<String> matchAll(String[] names) {
    SeriesNameCollection seriesNames = new SeriesNameCollection();

    // allow matching of a small number of episodes, by setting threshold = length if length < 5
    int threshold = Math.min(names.length, 5);

    // match common word sequences (likely series names)
    SeriesNameCollection whitelist = new SeriesNameCollection();

    // focus chars before the SxE / Date pattern when matching by common word sequence
    String[] focus = Arrays.copyOf(names, names.length);
    for (int i = 0; i < focus.length; i++) {
        String beforeSxE = seasonEpisodeMatcher.head(focus[i]);
        if (beforeSxE != null && beforeSxE.length() > 0) {
            focus[i] = beforeSxE;
        } else {
            int datePos = dateMatcher.find(focus[i], 0);
            if (datePos >= 0) {
                focus[i] = focus[i].substring(0, datePos);
            }
        }
    }
    whitelist.addAll(deepMatchAll(focus, threshold));

    // whitelist entries are plain text, not regular expressions: quote each one so that characters such as
    // '(' '+' or '.' are matched literally and can never produce an invalid pattern
    StringBuilder prefixExpression = new StringBuilder();
    for (String entry : whitelist) {
        if (prefixExpression.length() > 0) {
            prefixExpression.append('|');
        }
        prefixExpression.append(Pattern.quote(entry));
    }

    // 1. use pattern matching
    seriesNames.addAll(flatMatchAll(names, compile(prefixExpression.toString(), CASE_INSENSITIVE | UNICODE_CHARACTER_CLASS), threshold, false));

    // 2. use common word sequences
    seriesNames.addAll(whitelist);

    return seriesNames;
}
/**
 * Collect series name candidates by cutting each (normalized) name at its season / episode pattern, or at its date
 * pattern if no season / episode pattern is found.
 *
 * @param names
 *            episode names
 * @param prefixPattern
 *            pattern for known name prefixes; the search for season / episode and date patterns starts at the end
 *            of the first match of this pattern (or at the beginning if it doesn't match)
 * @param threshold
 *            number of equal hits that is required before uncertain hits are accepted
 * @param strict
 *            if true, hits in front of a season / episode pattern are always subject to the threshold
 * @return series names that have been matched one or multiple times depending on the threshold
 */
private Collection<String> flatMatchAll(String[] names, Pattern prefixPattern, int threshold, boolean strict) {
    Comparator<String> wordComparator = (Comparator) commonSequenceMatcher.getCollator();
    ThresholdCollection<String> matches = new ThresholdCollection<String>(threshold, wordComparator);

    for (String rawName : names) {
        // work on the normalized name
        String name = normalize(rawName);

        // skip past a known prefix, if there is one
        Matcher prefix = prefixPattern.matcher(name);
        int searchFrom = 0;
        if (prefix.find()) {
            searchFrom = prefix.end();
        }

        int sxePosition = seasonEpisodeMatcher.find(name, searchFrom);
        if (sxePosition <= 0) {
            // no usable season / episode pattern, fall back to the date pattern
            int datePosition = dateMatcher.find(name, searchFrom);
            if (datePosition > 0) {
                matches.addDirect(name.substring(0, datePosition).trim());
            }
            continue;
        }

        String hit = name.substring(0, sxePosition).trim();
        List<SxE> episodes = seasonEpisodeMatcher.match(name.substring(sxePosition));

        boolean likelyGenuine = !strict && episodes.size() == 1 && episodes.get(0).season >= 0;
        if (likelyGenuine) {
            // a single episode match with a season is trusted, bypass the threshold
            matches.addDirect(hit);
        } else {
            // possibly a false match, require it to show up multiple times
            matches.add(hit);
        }
    }

    return matches;
}
/**
 * Find common word sequences in the given names. If all names share a common sequence, that sequence is the only
 * result; otherwise the names are split into two halves which are matched recursively.
 *
 * @param names
 *            list of episode names
 * @param threshold
 *            minimum number of names required for matching
 * @return all common word sequences that have been found; empty if there are fewer than 2 names or fewer names
 *         than the threshold
 */
private Collection<String> deepMatchAll(String[] names, int threshold) {
    // common word sequence matching needs at least 2 names, and at least as many names as the threshold
    boolean enoughNames = names.length >= 2 && names.length >= threshold;
    if (!enoughNames) {
        return emptySet();
    }

    String common = commonSequenceMatcher.matchFirstCommonSequence(names);
    if (common == null) {
        // nothing in common for the whole list: divide and conquer
        int middle = names.length / 2;
        List<String> results = new ArrayList<String>();
        results.addAll(deepMatchAll(Arrays.copyOfRange(names, 0, middle), threshold));
        results.addAll(deepMatchAll(Arrays.copyOfRange(names, middle, names.length), threshold));
        return results;
    }

    // all names share this word sequence
    return singleton(common);
}
/**
 * Extract a series name from the given episode name, using known season / episode patterns and, as fallback, date
 * patterns.
 *
 * @param name
 *            episode name
 * @return the non-empty head reported by the season / episode matcher, otherwise the part of the name in front of
 *         the first date pattern (if that pattern does not start at index 0), or null if neither applies
 */
public String matchByEpisodeIdentifier(String name) {
    // series name ends at the first season episode pattern
    String head = seasonEpisodeMatcher.head(name);
    boolean hasHead = head != null && head.length() > 0;
    if (hasHead) {
        return head;
    }

    // otherwise the series name ends at the first date pattern
    int datePosition = dateMatcher.find(name, 0);
    return datePosition > 0 ? name.substring(0, datePosition) : null;
}
// one or more dashes surrounded by whitespace (e.g. " - "); compiled once since Pattern instances are immutable
private static final Pattern NAME_SEPARATOR = Pattern.compile("[\\s]+[-]+[\\s]+", Pattern.UNICODE_CHARACTER_CLASS);

/**
 * Extract a series name by cutting the given name at its first dash separator, i.e. one or more dashes surrounded
 * by whitespace.
 *
 * @param name
 *            episode name
 * @return the part of the name in front of the first separator with punctuation normalized, or null if there is
 *         no separator or if the first separator starts at index 0
 */
public String matchBySeparator(String name) {
    Matcher matcher = NAME_SEPARATOR.matcher(name);

    if (matcher.find() && matcher.start() > 0) {
        return normalizePunctuation(name.substring(0, matcher.start()));
    }

    return null;
}
/**
 * Match a series name by looking for the first word sequence that the given names have in common.
 *
 * @param names
 *            various episode names (at least two)
 * @return the result of the common sequence matcher for the given names
 * @throws IllegalArgumentException
 *             if less than 2 episode names are given
 */
public String matchByFirstCommonWordSequence(String... names) {
    if (names.length >= 2) {
        return commonSequenceMatcher.matchFirstCommonSequence(names);
    }

    throw new IllegalArgumentException("Can't match common sequence from less than two names");
}
/**
 * Normalize the given name by applying bracket normalization followed by punctuation normalization.
 *
 * @param name
 *            name to normalize
 * @return the normalized name
 */
protected String normalize(String name) {
    // first deal with bracketed parts like [...] or (...) (group names, checksums, etc)
    String withoutBrackets = normalizeBrackets(name);

    // then remove / normalize special characters
    return normalizePunctuation(withoutBrackets);
}
/**
 * Find the first run of consecutive elements that both sequences have in common.
 *
 * Start positions are tried in order: for each start index in the first sequence, every start index in the second
 * sequence is checked, and the first pair of positions with at least one equal element decides the result.
 *
 * @param seq1
 *            first sequence
 * @param seq2
 *            second sequence
 * @param maxStartIndex
 *            highest index (in either sequence) at which a common run may start
 * @param equalsComparator
 *            elements are considered equal if this comparator returns 0
 * @return the common run as elements of the first sequence (the first sequence itself, if the run covers all of
 *         it), or null if there is no common run
 */
protected <T> T[] firstCommonSequence(T[] seq1, T[] seq2, int maxStartIndex, Comparator<T> equalsComparator) {
    for (int start1 = 0; start1 < seq1.length && start1 <= maxStartIndex; start1++) {
        for (int start2 = 0; start2 < seq2.length && start2 <= maxStartIndex; start2++) {
            // extend the run for as long as both sequences have equal elements
            int length = 0;
            while (start1 + length < seq1.length && start2 + length < seq2.length) {
                if (equalsComparator.compare(seq1[start1 + length], seq2[start2 + length]) != 0) {
                    break;
                }
                length++;
            }

            if (length == 0) {
                // nothing in common at this pair of positions
                continue;
            }

            // no need to copy if the run is the complete first sequence
            boolean coversAll = start1 == 0 && length == seq1.length;
            return coversAll ? seq1 : Arrays.copyOfRange(seq1, start1, start1 + length);
        }
    }

    // no intersection at all
    return null;
}
/**
 * Group the given files by parent folder and map each folder to the names of its files. Folders and files keep
 * the order in which they are first encountered.
 *
 * @param files
 *            files to group
 * @return map of parent folder (as returned by {@link File#getParentFile()}) to file names
 */
private Map<File, String[]> mapNamesByFolder(File... files) {
    // first pass: folder -> files
    Map<File, List<File>> grouped = new LinkedHashMap<File, List<File>>();
    for (File file : files) {
        File folder = file.getParentFile();
        if (!grouped.containsKey(folder)) {
            grouped.put(folder, new ArrayList<File>());
        }
        grouped.get(folder).add(file);
    }

    // second pass: folder -> names
    Map<File, String[]> result = new LinkedHashMap<File, String[]>();
    for (File folder : grouped.keySet()) {
        String[] folderNames = names(grouped.get(folder));
        result.put(folder, folderNames);
    }
    return result;
}
/**
 * Map each of the given files to its name as returned by {@link FileUtilities#getName(File)}.
 *
 * @param files
 *            files to get the names of
 * @return names of the given files, in iteration order
 */
protected String[] names(Collection<File> files) {
    String[] result = new String[files.size()];

    Iterator<File> iterator = files.iterator();
    for (int index = 0; iterator.hasNext(); index++) {
        result[index] = FileUtilities.getName(iterator.next());
    }

    return result;
}
/**
 * Collection of series names that ignores case: names that only differ in upper/lower case share one entry, and
 * the spelling with the better {@link #firstCharacterCaseBalance(String)} is kept. Iteration follows insertion
 * order of the keys.
 */
protected static class SeriesNameCollection extends AbstractCollection<String> {

    // case-insensitive key -> preferred spelling
    private final Map<String, String> data = new LinkedHashMap<String, String>();

    /**
     * Add the given name after trimming it.
     *
     * @param value
     *            series name
     * @return true if the name was stored (as new entry or as replacement for a spelling with a lower case
     *         balance), false if it is shorter than two characters or an equal or better spelling is already stored
     */
    @Override
    public boolean add(String value) {
        value = value.trim();

        // require series name to have at least two characters
        if (value.length() < 2) {
            return false;
        }

        String key = key(value);
        String current = data.get(key);

        // prefer strings with similar upper/lower case ratio (e.g. prefer Roswell over roswell)
        if (current == null || firstCharacterCaseBalance(current) < firstCharacterCaseBalance(value)) {
            data.put(key, value);
            return true;
        }

        return false;
    }

    /**
     * Compute the case-insensitive key of the given value.
     *
     * @param value
     *            any object, its string representation is used
     * @return lower case string representation
     */
    protected String key(Object value) {
        // Locale.ROOT keeps keys independent of the default locale (e.g. Turkish maps 'I' to dotless 'ı')
        return value.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Rate how balanced the first characters of the whitespace separated words of the given string are with regard
     * to upper and lower case.
     *
     * @param s
     *            string to rate
     * @return (lower + upper * 1.01) / |lower - upper|, which is positive infinity if there are as many upper case
     *         as lower case first characters, and NaN if no word starts with a cased character
     */
    protected float firstCharacterCaseBalance(String s) {
        int upper = 0;
        int lower = 0;

        // Scanner uses a white space delimiter by default
        try (Scanner scanner = new Scanner(s)) {
            while (scanner.hasNext()) {
                char c = scanner.next().charAt(0);

                if (Character.isLowerCase(c)) {
                    lower++;
                } else if (Character.isUpperCase(c)) {
                    upper++;
                }
            }
        }

        // give upper case characters a slight boost over lower case characters
        return (lower + (upper * 1.01f)) / Math.abs(lower - upper);
    }

    @Override
    public boolean contains(Object value) {
        return data.containsKey(key(value));
    }

    @Override
    public Iterator<String> iterator() {
        return data.values().iterator();
    }

    @Override
    public int size() {
        return data.size();
    }
}
/**
 * Collection that only exposes elements that have been added at least <code>threshold</code> times (with equality
 * defined by the given comparator), plus elements that have been added directly.
 */
protected static class ThresholdCollection<E> extends AbstractCollection<E> {

    // elements that are visible to the outside
    private final Collection<E> accepted;

    // elements waiting to reach the threshold, grouped by comparator equality;
    // once a group has reached the threshold its key is mapped to the accepted collection itself
    private final Map<E, Collection<E>> pending;

    private final int threshold;

    public ThresholdCollection(int threshold, Comparator<E> equalityComparator) {
        this.threshold = threshold;
        this.pending = new TreeMap<E, Collection<E>>(equalityComparator);
        this.accepted = new ArrayList<E>();
    }

    /**
     * Add the given element subject to the threshold.
     *
     * @return true if the element (and possibly earlier equal elements) became visible, false if it is still
     *         waiting for the threshold to be reached
     */
    @Override
    public boolean add(E value) {
        Collection<E> bucket = pending.get(value);

        if (bucket == accepted) {
            // threshold has been reached before, accept right away
            accepted.add(value);
            return true;
        }

        if (bucket == null) {
            // first element of its kind
            bucket = new ArrayList<E>(threshold);
            pending.put(value, bucket);
        }

        bucket.add(value);

        if (bucket.size() < threshold) {
            // keep waiting
            return false;
        }

        // threshold reached: release everything that has been waiting and mark the group as accepted
        accepted.addAll(bucket);
        pending.put(value, accepted);
        return true;
    }

    /**
     * Add the given element without applying the threshold.
     */
    public boolean addDirect(E element) {
        return accepted.add(element);
    }

    @Override
    public Iterator<E> iterator() {
        return accepted.iterator();
    }

    @Override
    public int size() {
        return accepted.size();
    }
}
}