Refactor string similarity code

This commit is contained in:
Reinhard Pointner 2019-01-30 23:30:59 +07:00
parent dfa4f78448
commit 9a228e6927
6 changed files with 33 additions and 89 deletions

View File

@ -7,6 +7,7 @@
<dependency rev="4.5.2" org="net.java.dev.jna" name="jna-platform" />
<dependency rev="2.33" org="args4j" name="args4j" />
<dependency rev="2.10.5" org="net.sf.ehcache" name="ehcache" />
<dependency rev="4.1.1" org="com.github.mpkorstanje" name="simmetrics-core" />
<dependency rev="61.1" org="com.ibm.icu" name="icu4j" />
<dependency rev="1.11.3" org="org.jsoup" name="jsoup" />
<dependency rev="1.8" org="org.tukaani" name="xz" />

View File

@ -15,6 +15,7 @@ ivy/jar/jaxb-impl.jar
ivy/jar/junrar.jar
ivy/jar/jna.jar
ivy/jar/jna-platform.jar
ivy/jar/simmetrics-core.jar
ivy/jar/streamex.jar
ivy/jar/icu4j.jar
ivy/jar/language-detector.jar
@ -53,5 +54,4 @@ ivy/bundle/json-io.jar
ivy/bundle/guava.jar
jars/xmlrpc.jar
jars/ObjCBridge.jar
jars/simmetrics.jar
jars/jacksum.jar

Binary file not shown.

View File

@ -1,40 +1,26 @@
package net.filebot.similarity;
import static net.filebot.similarity.Normalization.*;
import static org.simmetrics.builders.StringMetricBuilder.*;
import static org.simmetrics.tokenizers.Tokenizers.*;
import org.simmetrics.StringMetric;
import org.simmetrics.metrics.BlockDistance;
import com.ibm.icu.text.Transliterator;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
public class NameSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
private final Transliterator transliterator;
public NameSimilarityMetric() {
// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
}
public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
this.metric = metric;
this.transliterator = transliterator;
}
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
@Override
public float getSimilarity(Object o1, Object o2) {
return metric.getSimilarity(normalize(o1), normalize(o2));
return metric.compare(normalize(o1), normalize(o2));
}
protected String normalize(Object object) {
// use string representation
String name = object.toString();

View File

@ -1,32 +1,22 @@
package net.filebot.similarity;
import static java.util.stream.Collectors.*;
import static net.filebot.util.RegularExpressions.*;
import static net.filebot.util.StringUtilities.*;
import static org.simmetrics.builders.StringMetricBuilder.*;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.List;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
import org.simmetrics.StringMetric;
import org.simmetrics.metrics.BlockDistance;
import org.simmetrics.tokenizers.AbstractTokenizer;
public class NumericSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
public NumericSimilarityMetric() {
// I don't exactly know why, but I get a good matching behavior
// when using QGramsDistance or BlockDistance
metric = new QGramsDistance(new NumberTokeniser());
}
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(new NumberTokeniser()).build();
@Override
public float getSimilarity(Object o1, Object o2) {
return metric.getSimilarity(normalize(o1), normalize(o2));
return metric.compare(normalize(o1), normalize(o2));
}
protected String normalize(Object object) {
@ -34,40 +24,12 @@ public class NumericSimilarityMetric implements SimilarityMetric {
return object.toString();
}
private static class NumberTokeniser implements InterfaceTokeniser {
private static class NumberTokeniser extends AbstractTokenizer {
@Override
public ArrayList<String> tokenizeToArrayList(String s) {
return matchIntegers(s).stream().map(String::valueOf).collect(toCollection(ArrayList::new));
public List<String> tokenizeToList(String input) {
// one token per value returned by matchIntegers(input), each rendered via String.valueOf, in stream order
return matchIntegers(input).stream().map(String::valueOf).collect(toList());
}
@Override
public String getDelimiters() {
return NON_DIGIT.pattern();
}
@Override
public Set<String> tokenizeToSet(String input) {
return new LinkedHashSet<String>(tokenizeToArrayList(input));
}
@Override
public String getShortDescriptionString() {
return getClass().getSimpleName();
}
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
@Override
public InterfaceTermHandler getStopWordHandler() {
return stopWordHandler;
}
@Override
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
this.stopWordHandler = stopWordHandler;
}
}
}

View File

@ -5,6 +5,8 @@ import static java.util.Collections.reverseOrder;
import static java.util.Comparator.*;
import static java.util.stream.Collectors.*;
import static net.filebot.similarity.Normalization.*;
import static org.simmetrics.builders.StringMetricBuilder.*;
import static org.simmetrics.tokenizers.Tokenizers.*;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.Collection;
@ -16,21 +18,22 @@ import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import java.util.stream.IntStream;
import com.ibm.icu.text.Transliterator;
import org.simmetrics.StringMetric;
import org.simmetrics.metrics.BlockDistance;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import com.ibm.icu.text.Transliterator;
public class LocalSearch<T> {
private AbstractStringMetric metric = new QGramsDistance();
private float resultMinimumSimilarity = 0.5f;
private int resultSetSize = 20;
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
private T[] objects;
private Set<String>[] fields;
private final float resultMinimumSimilarity = 0.5f;
private final int resultSetSize = 20;
private final T[] objects;
private final Set<String>[] fields;
public LocalSearch(T[] data, Function<T, Collection<String>> keywords) {
objects = data.clone();
@ -45,20 +48,12 @@ public class LocalSearch<T> {
Set<String> field = fields[i];
boolean match = field.stream().anyMatch(it -> it.contains(query));
double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0);
double similarity = field.stream().mapToDouble(it -> metric.compare(query, it)).max().orElse(0);
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry<T, Double>(object, similarity) : null;
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
}
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
this.resultMinimumSimilarity = resultMinimumSimilarity;
}
public void setResultSetSize(int resultSetSize) {
this.resultSetSize = resultSetSize;
}
/**
 * Normalizes every given value and gathers the results into a set.
 *
 * Each element is passed through the single-value {@code normalize} method of this
 * class (referenced as {@code this::normalize}; its body is not part of this hunk),
 * and the results are collected with {@code toSet()}, so equal normalized values
 * collapse into one entry.
 *
 * @param values the strings to normalize
 * @return a set containing the normalized form of each value
 */
protected Set<String> normalize(Collection<String> values) {
return values.stream().map(this::normalize).collect(toSet());
}