filebot/source/net/filebot/similarity/CommonSequenceMatcher.java

118 lines
3.2 KiB
Java

package net.filebot.similarity;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.filebot.util.RegularExpressions.*;
import java.text.CollationKey;
import java.text.Collator;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
public class CommonSequenceMatcher {
public static Collator getLenientCollator(Locale locale) {
// use maximum strength collator by default
Collator collator = Collator.getInstance(locale);
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
collator.setStrength(Collator.PRIMARY);
return collator;
}
protected final Collator collator;
protected final int commonSequenceMaxStartIndex;
protected final boolean returnFirstMatch;
public CommonSequenceMatcher(Collator collator, int commonSequenceMaxStartIndex, boolean returnFirstMatch) {
this.collator = collator;
this.commonSequenceMaxStartIndex = commonSequenceMaxStartIndex;
this.returnFirstMatch = returnFirstMatch;
}
public Collator getCollator() {
return collator;
}
public String matchFirstCommonSequence(String... names) {
CollationKey[][] words = new CollationKey[names.length][];
for (int i = 0; i < names.length; i++) {
words[i] = split(names[i]);
}
return synth(matchFirstCommonSequence(words));
}
public <E extends Comparable<E>> E[] matchFirstCommonSequence(E[][] names) {
E[] common = null;
for (E[] words : names) {
if (common == null) {
// initialize common with current word array
common = words;
} else {
// find common sequence
common = firstCommonSequence(common, words, commonSequenceMaxStartIndex, returnFirstMatch);
if (common == null) {
// no common sequence
return null;
}
}
}
return common;
}
protected String synth(CollationKey[] keys) {
if (keys == null) {
return null;
}
StringBuilder sb = new StringBuilder();
for (CollationKey it : keys) {
if (sb.length() > 0) {
sb.append(' ');
}
sb.append(it.getSourceString());
}
return sb.toString();
}
public CollationKey[] split(String sequence) {
return getCollationKeys(SPACE.split(sequence));
}
private final Map<String, CollationKey> collationKeyDictionary = synchronizedMap(new HashMap<String, CollationKey>(64, 4));
protected CollationKey[] getCollationKeys(String[] words) {
return stream(words).map(w -> {
return collationKeyDictionary.computeIfAbsent(w, collator::getCollationKey);
}).toArray(CollationKey[]::new);
}
protected <E extends Comparable<E>> E[] firstCommonSequence(E[] seq1, E[] seq2, int maxStartIndex, boolean returnFirstMatch) {
E[] matchSeq = null;
for (int i = 0; i < seq1.length && i <= maxStartIndex; i++) {
for (int j = 0; j < seq2.length && j <= maxStartIndex; j++) {
// common sequence length
int len = 0;
// iterate over common sequence
while ((i + len < seq1.length) && (j + len < seq2.length) && (seq1[i + len].compareTo(seq2[j + len]) == 0)) {
len++;
}
// check if a common sequence was found
if (len > (matchSeq == null ? 0 : matchSeq.length)) {
matchSeq = copyOfRange(seq1, i, i + len);
// look for first match
if (returnFirstMatch) {
return matchSeq;
}
}
}
}
return matchSeq;
}
}