diff --git a/ivy.xml b/ivy.xml index fc724d52..6471e322 100644 --- a/ivy.xml +++ b/ivy.xml @@ -7,6 +7,7 @@ + diff --git a/lib/jar.includes b/lib/jar.includes index 9e20cd3c..3fbee022 100644 --- a/lib/jar.includes +++ b/lib/jar.includes @@ -15,6 +15,7 @@ ivy/jar/jaxb-impl.jar ivy/jar/junrar.jar ivy/jar/jna.jar ivy/jar/jna-platform.jar +ivy/jar/simmetrics-core.jar ivy/jar/streamex.jar ivy/jar/icu4j.jar ivy/jar/language-detector.jar @@ -53,5 +54,4 @@ ivy/bundle/json-io.jar ivy/bundle/guava.jar jars/xmlrpc.jar jars/ObjCBridge.jar -jars/simmetrics.jar jars/jacksum.jar \ No newline at end of file diff --git a/lib/jars/simmetrics.jar b/lib/jars/simmetrics.jar deleted file mode 100644 index 17bf7118..00000000 Binary files a/lib/jars/simmetrics.jar and /dev/null differ diff --git a/source/net/filebot/similarity/NameSimilarityMetric.java b/source/net/filebot/similarity/NameSimilarityMetric.java index f744306f..92111508 100644 --- a/source/net/filebot/similarity/NameSimilarityMetric.java +++ b/source/net/filebot/similarity/NameSimilarityMetric.java @@ -1,40 +1,26 @@ package net.filebot.similarity; - import static net.filebot.similarity.Normalization.*; +import static org.simmetrics.builders.StringMetricBuilder.*; +import static org.simmetrics.tokenizers.Tokenizers.*; + +import org.simmetrics.StringMetric; +import org.simmetrics.metrics.BlockDistance; import com.ibm.icu.text.Transliterator; -import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; -import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; -import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3; - - public class NameSimilarityMetric implements SimilarityMetric { - private final AbstractStringMetric metric; - private final Transliterator transliterator; - - - public NameSimilarityMetric() { - // QGramsDistance with a QGram tokenizer seems to work best for similarity of names - this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove")); - } - - - public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) { - this.metric = metric; - this.transliterator = transliterator; - } + private final StringMetric metric = with(new BlockDistance()).tokenize(qGramWithPadding(3)).build(); + private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); @Override public float getSimilarity(Object o1, Object o2) { - return metric.getSimilarity(normalize(o1), normalize(o2)); + return metric.compare(normalize(o1), normalize(o2)); } - protected String normalize(Object object) { // use string representation String name = object.toString(); diff --git a/source/net/filebot/similarity/NumericSimilarityMetric.java b/source/net/filebot/similarity/NumericSimilarityMetric.java index 9d5c6a9c..50982719 100644 --- a/source/net/filebot/similarity/NumericSimilarityMetric.java +++ b/source/net/filebot/similarity/NumericSimilarityMetric.java @@ -1,32 +1,22 @@ package net.filebot.similarity; import static java.util.stream.Collectors.*; -import static net.filebot.util.RegularExpressions.*; import static net.filebot.util.StringUtilities.*; +import static org.simmetrics.builders.StringMetricBuilder.*; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.Set; +import java.util.List; -import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; -import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; -import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser; -import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler; -import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler; +import org.simmetrics.StringMetric; +import org.simmetrics.metrics.BlockDistance; +import org.simmetrics.tokenizers.AbstractTokenizer; public class NumericSimilarityMetric implements SimilarityMetric { - private final AbstractStringMetric metric; - - public NumericSimilarityMetric() { - // I don't exactly know why, but I get a good matching behavior - // when using QGramsDistance or BlockDistance - metric = new QGramsDistance(new NumberTokeniser()); - } + private final StringMetric metric = with(new BlockDistance()).tokenize(new NumberTokeniser()).build(); @Override public float getSimilarity(Object o1, Object o2) { - return metric.getSimilarity(normalize(o1), normalize(o2)); + return metric.compare(normalize(o1), normalize(o2)); } protected String normalize(Object object) { @@ -34,40 +24,12 @@ public class NumericSimilarityMetric implements SimilarityMetric { return object.toString(); } - private static class NumberTokeniser implements InterfaceTokeniser { + private static class NumberTokeniser extends AbstractTokenizer { @Override - public ArrayList tokenizeToArrayList(String s) { - return matchIntegers(s).stream().map(String::valueOf).collect(toCollection(ArrayList::new)); + public List tokenizeToList(String input) { + return matchIntegers(input).stream().map(String::valueOf).collect(toList()); } - - @Override - public String getDelimiters() { - return NON_DIGIT.pattern(); - } - - @Override - public Set tokenizeToSet(String input) { - return new LinkedHashSet(tokenizeToArrayList(input)); - } - - @Override - public String getShortDescriptionString() { - return getClass().getSimpleName(); - } - - private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler(); - - @Override - public InterfaceTermHandler getStopWordHandler() { - return stopWordHandler; - } - - @Override - public void setStopWordHandler(InterfaceTermHandler stopWordHandler) { - this.stopWordHandler = stopWordHandler; - } - } } diff --git a/source/net/filebot/web/LocalSearch.java b/source/net/filebot/web/LocalSearch.java index f24b200b..0d9d22dd 100644 --- a/source/net/filebot/web/LocalSearch.java +++ b/source/net/filebot/web/LocalSearch.java @@ -5,6 +5,8 @@ import static java.util.Collections.reverseOrder; import static java.util.Comparator.*; import static java.util.stream.Collectors.*; import static net.filebot.similarity.Normalization.*; +import static org.simmetrics.builders.StringMetricBuilder.*; +import static org.simmetrics.tokenizers.Tokenizers.*; import java.util.AbstractMap.SimpleImmutableEntry; import java.util.Collection; @@ -16,21 +18,22 @@ import java.util.concurrent.ExecutionException; import java.util.function.Function; import java.util.stream.IntStream; -import com.ibm.icu.text.Transliterator; +import org.simmetrics.StringMetric; +import org.simmetrics.metrics.BlockDistance; -import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; -import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; +import com.ibm.icu.text.Transliterator; public class LocalSearch { - private AbstractStringMetric metric = new QGramsDistance(); - private float resultMinimumSimilarity = 0.5f; - private int resultSetSize = 20; + private final StringMetric metric = with(new BlockDistance()).tokenize(qGramWithPadding(3)).build(); - private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); + private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"); - private T[] objects; - private Set[] fields; + private final float resultMinimumSimilarity = 0.5f; + private final int resultSetSize = 20; + + private final T[] objects; + private final Set[] fields; public LocalSearch(T[] data, Function> keywords) { objects = data.clone(); @@ -45,20 +48,12 @@ public class LocalSearch { Set field = fields[i]; boolean match = field.stream().anyMatch(it -> it.contains(query)); - double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0); + double similarity = field.stream().mapToDouble(it -> metric.compare(query, it)).max().orElse(0); return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry(object, similarity) : null; }).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList()); } - public void setResultMinimumSimilarity(float resultMinimumSimilarity) { - this.resultMinimumSimilarity = resultMinimumSimilarity; - } - - public void setResultSetSize(int resultSetSize) { - this.resultSetSize = resultSetSize; - } - protected Set normalize(Collection values) { return values.stream().map(this::normalize).collect(toSet()); }