// Mirror of https://github.com/mitb-archive/filebot (synced 2024-11-16 14:25:02 -05:00).
// 124 lines, 3.3 KiB, Java.
package net.filebot.similarity;

import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.filebot.util.RegularExpressions.*;

import java.text.CollationKey;
import java.text.Collator;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

public class CommonSequenceMatcher {
|
|
|
|
public static Collator getLenientCollator(Locale locale) {
|
|
// use maximum strength collator by default
|
|
Collator collator = Collator.getInstance(locale);
|
|
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
|
|
collator.setStrength(Collator.PRIMARY);
|
|
return collator;
|
|
}
|
|
|
|
protected final Collator collator;
|
|
protected final int commonSequenceMaxStartIndex;
|
|
protected final boolean returnFirstMatch;
|
|
|
|
public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) {
|
|
this.collator = collator;
|
|
this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
|
|
this.returnFirstMatch = returnFirstMatch;
|
|
}
|
|
|
|
public Collator getCollator() {
|
|
return collator;
|
|
}
|
|
|
|
public String matchFirstCommonSequence(String... names) {
|
|
CollationKey[][] words = new CollationKey[names.length][];
|
|
for (int i = 0; i < names.length; i++) {
|
|
words[i] = split(names[i]);
|
|
}
|
|
return synth(matchFirstCommonSequence(words));
|
|
}
|
|
|
|
public <E extends Comparable<E>> E[] matchFirstCommonSequence(E[][] names) {
|
|
E[] common = null;
|
|
|
|
for (E[] words : names) {
|
|
if (common == null) {
|
|
// initialize common with current word array
|
|
common = words;
|
|
} else {
|
|
// find common sequence
|
|
common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch);
|
|
|
|
if (common == null) {
|
|
// no common sequence
|
|
return null;
|
|
}
|
|
}
|
|
}
|
|
return common;
|
|
}
|
|
|
|
protected String synth(CollationKey[] keys) {
|
|
if (keys == null) {
|
|
return null;
|
|
}
|
|
|
|
StringBuilder sb = new StringBuilder();
|
|
for (CollationKey it : keys) {
|
|
if (sb.length() > 0) {
|
|
sb.append(' ');
|
|
}
|
|
sb.append(it.getSourceString());
|
|
}
|
|
return sb.toString();
|
|
}
|
|
|
|
public CollationKey[] split(String sequence) {
|
|
return getCollationKeys(SPACE.split(sequence));
|
|
}
|
|
|
|
private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new HashMap<String, CollationKey>(256));
|
|
|
|
protected CollationKey[] getCollationKeys(String[] words) {
|
|
CollationKey[] keys = new CollationKey[words.length];
|
|
for (int i = 0; i < keys.length; i++) {
|
|
keys[i] = collationKeyDictionary.get(words[i]);
|
|
if (keys[i] == null) {
|
|
keys[i] = collator.getCollationKey(words[i]);
|
|
collationKeyDictionary.put(words[i], keys[i]);
|
|
}
|
|
}
|
|
return keys;
|
|
}
|
|
|
|
protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) {
|
|
E[] matchSeq = null;
|
|
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
|
|
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
|
|
// common sequence length
|
|
int len = 0;
|
|
|
|
// iterate over common sequence
|
|
while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
|
|
len++;
|
|
}
|
|
|
|
// check if a common sequence was found
|
|
if (len > (matchSeq == null ? 0 : matchSeq.length)) {
|
|
matchSeq = copyOfRange(seq1, i, i + len);
|
|
|
|
// look for first match
|
|
if (returnFirstMatch) {
|
|
return matchSeq;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return matchSeq;
|
|
}
|
|
}
|