diff --git a/ivy.xml b/ivy.xml
index fc724d52..6471e322 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -7,6 +7,7 @@
+
diff --git a/lib/jar.includes b/lib/jar.includes
index 9e20cd3c..3fbee022 100644
--- a/lib/jar.includes
+++ b/lib/jar.includes
@@ -15,6 +15,7 @@ ivy/jar/jaxb-impl.jar
ivy/jar/junrar.jar
ivy/jar/jna.jar
ivy/jar/jna-platform.jar
+ivy/jar/simmetrics-core.jar
ivy/jar/streamex.jar
ivy/jar/icu4j.jar
ivy/jar/language-detector.jar
@@ -53,5 +54,4 @@ ivy/bundle/json-io.jar
ivy/bundle/guava.jar
jars/xmlrpc.jar
jars/ObjCBridge.jar
-jars/simmetrics.jar
jars/jacksum.jar
\ No newline at end of file
diff --git a/lib/jars/simmetrics.jar b/lib/jars/simmetrics.jar
deleted file mode 100644
index 17bf7118..00000000
Binary files a/lib/jars/simmetrics.jar and /dev/null differ
diff --git a/source/net/filebot/similarity/NameSimilarityMetric.java b/source/net/filebot/similarity/NameSimilarityMetric.java
index f744306f..92111508 100644
--- a/source/net/filebot/similarity/NameSimilarityMetric.java
+++ b/source/net/filebot/similarity/NameSimilarityMetric.java
@@ -1,40 +1,26 @@
package net.filebot.similarity;
-
import static net.filebot.similarity.Normalization.*;
+import static org.simmetrics.builders.StringMetricBuilder.*;
+import static org.simmetrics.tokenizers.Tokenizers.*;
+
+import org.simmetrics.StringMetric;
+import org.simmetrics.metrics.BlockDistance;
import com.ibm.icu.text.Transliterator;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
-import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
-
-
public class NameSimilarityMetric implements SimilarityMetric {
- private final AbstractStringMetric metric;
- private final Transliterator transliterator;
-
-
- public NameSimilarityMetric() {
- // QGramsDistance with a QGram tokenizer seems to work best for similarity of names
- this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
- }
-
-
- public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
- this.metric = metric;
- this.transliterator = transliterator;
- }
+ private final StringMetric metric = with(new BlockDistance()).tokenize(qGramWithPadding(3)).build();
+ private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
@Override
public float getSimilarity(Object o1, Object o2) {
- return metric.getSimilarity(normalize(o1), normalize(o2));
+ return metric.compare(normalize(o1), normalize(o2));
}
-
protected String normalize(Object object) {
// use string representation
String name = object.toString();
diff --git a/source/net/filebot/similarity/NumericSimilarityMetric.java b/source/net/filebot/similarity/NumericSimilarityMetric.java
index 9d5c6a9c..50982719 100644
--- a/source/net/filebot/similarity/NumericSimilarityMetric.java
+++ b/source/net/filebot/similarity/NumericSimilarityMetric.java
@@ -1,32 +1,22 @@
package net.filebot.similarity;
import static java.util.stream.Collectors.*;
-import static net.filebot.util.RegularExpressions.*;
import static net.filebot.util.StringUtilities.*;
+import static org.simmetrics.builders.StringMetricBuilder.*;
-import java.util.ArrayList;
-import java.util.LinkedHashSet;
-import java.util.Set;
+import java.util.List;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
-import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
-import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
-import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
+import org.simmetrics.StringMetric;
+import org.simmetrics.metrics.BlockDistance;
+import org.simmetrics.tokenizers.AbstractTokenizer;
public class NumericSimilarityMetric implements SimilarityMetric {
- private final AbstractStringMetric metric;
-
- public NumericSimilarityMetric() {
- // I don't exactly know why, but I get a good matching behavior
- // when using QGramsDistance or BlockDistance
- metric = new QGramsDistance(new NumberTokeniser());
- }
+ private final StringMetric metric = with(new BlockDistance()).tokenize(new NumberTokeniser()).build();
@Override
public float getSimilarity(Object o1, Object o2) {
- return metric.getSimilarity(normalize(o1), normalize(o2));
+ return metric.compare(normalize(o1), normalize(o2));
}
protected String normalize(Object object) {
@@ -34,40 +24,12 @@ public class NumericSimilarityMetric implements SimilarityMetric {
return object.toString();
}
- private static class NumberTokeniser implements InterfaceTokeniser {
+ private static class NumberTokeniser extends AbstractTokenizer {
@Override
- public ArrayList tokenizeToArrayList(String s) {
- return matchIntegers(s).stream().map(String::valueOf).collect(toCollection(ArrayList::new));
+ public List tokenizeToList(String input) {
+ return matchIntegers(input).stream().map(String::valueOf).collect(toList());
}
-
- @Override
- public String getDelimiters() {
- return NON_DIGIT.pattern();
- }
-
- @Override
- public Set tokenizeToSet(String input) {
- return new LinkedHashSet(tokenizeToArrayList(input));
- }
-
- @Override
- public String getShortDescriptionString() {
- return getClass().getSimpleName();
- }
-
- private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
-
- @Override
- public InterfaceTermHandler getStopWordHandler() {
- return stopWordHandler;
- }
-
- @Override
- public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
- this.stopWordHandler = stopWordHandler;
- }
-
}
}
diff --git a/source/net/filebot/web/LocalSearch.java b/source/net/filebot/web/LocalSearch.java
index f24b200b..0d9d22dd 100644
--- a/source/net/filebot/web/LocalSearch.java
+++ b/source/net/filebot/web/LocalSearch.java
@@ -5,6 +5,8 @@ import static java.util.Collections.reverseOrder;
import static java.util.Comparator.*;
import static java.util.stream.Collectors.*;
import static net.filebot.similarity.Normalization.*;
+import static org.simmetrics.builders.StringMetricBuilder.*;
+import static org.simmetrics.tokenizers.Tokenizers.*;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Collection;
@@ -16,21 +18,22 @@ import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.stream.IntStream;
-import com.ibm.icu.text.Transliterator;
+import org.simmetrics.StringMetric;
+import org.simmetrics.metrics.BlockDistance;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
-import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
+import com.ibm.icu.text.Transliterator;
public class LocalSearch {
- private AbstractStringMetric metric = new QGramsDistance();
- private float resultMinimumSimilarity = 0.5f;
- private int resultSetSize = 20;
+ private final StringMetric metric = with(new BlockDistance()).tokenize(qGramWithPadding(3)).build();
- private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
+ private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
- private T[] objects;
- private Set[] fields;
+ private final float resultMinimumSimilarity = 0.5f;
+ private final int resultSetSize = 20;
+
+ private final T[] objects;
+ private final Set[] fields;
public LocalSearch(T[] data, Function> keywords) {
objects = data.clone();
@@ -45,20 +48,12 @@ public class LocalSearch {
Set field = fields[i];
boolean match = field.stream().anyMatch(it -> it.contains(query));
- double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0);
+ double similarity = field.stream().mapToDouble(it -> metric.compare(query, it)).max().orElse(0);
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry(object, similarity) : null;
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
}
- public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
- this.resultMinimumSimilarity = resultMinimumSimilarity;
- }
-
- public void setResultSetSize(int resultSetSize) {
- this.resultSetSize = resultSetSize;
- }
-
protected Set normalize(Collection values) {
return values.stream().map(this::normalize).collect(toSet());
}