mirror of
https://github.com/mitb-archive/filebot
synced 2024-12-22 15:58:52 -05:00
Refactor string similarity code
This commit is contained in:
parent
dfa4f78448
commit
9a228e6927
1
ivy.xml
1
ivy.xml
@ -7,6 +7,7 @@
|
||||
<dependency rev="4.5.2" org="net.java.dev.jna" name="jna-platform" />
|
||||
<dependency rev="2.33" org="args4j" name="args4j" />
|
||||
<dependency rev="2.10.5" org="net.sf.ehcache" name="ehcache" />
|
||||
<dependency rev="4.1.1" org="com.github.mpkorstanje" name="simmetrics-core" />
|
||||
<dependency rev="61.1" org="com.ibm.icu" name="icu4j" />
|
||||
<dependency rev="1.11.3" org="org.jsoup" name="jsoup" />
|
||||
<dependency rev="1.8" org="org.tukaani" name="xz" />
|
||||
|
@ -15,6 +15,7 @@ ivy/jar/jaxb-impl.jar
|
||||
ivy/jar/junrar.jar
|
||||
ivy/jar/jna.jar
|
||||
ivy/jar/jna-platform.jar
|
||||
ivy/jar/simmetrics-core.jar
|
||||
ivy/jar/streamex.jar
|
||||
ivy/jar/icu4j.jar
|
||||
ivy/jar/language-detector.jar
|
||||
@ -53,5 +54,4 @@ ivy/bundle/json-io.jar
|
||||
ivy/bundle/guava.jar
|
||||
jars/xmlrpc.jar
|
||||
jars/ObjCBridge.jar
|
||||
jars/simmetrics.jar
|
||||
jars/jacksum.jar
|
Binary file not shown.
@ -1,40 +1,26 @@
|
||||
|
||||
package net.filebot.similarity;
|
||||
|
||||
|
||||
import static net.filebot.similarity.Normalization.*;
|
||||
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||
import static org.simmetrics.tokenizers.Tokenizers.*;
|
||||
|
||||
import org.simmetrics.StringMetric;
|
||||
import org.simmetrics.metrics.BlockDistance;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
|
||||
|
||||
|
||||
public class NameSimilarityMetric implements SimilarityMetric {
|
||||
|
||||
private final AbstractStringMetric metric;
|
||||
private final Transliterator transliterator;
|
||||
|
||||
|
||||
public NameSimilarityMetric() {
|
||||
// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
|
||||
this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
|
||||
}
|
||||
|
||||
|
||||
public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
|
||||
this.metric = metric;
|
||||
this.transliterator = transliterator;
|
||||
}
|
||||
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
|
||||
|
||||
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
return metric.getSimilarity(normalize(o1), normalize(o2));
|
||||
return metric.compare(normalize(o1), normalize(o2));
|
||||
}
|
||||
|
||||
|
||||
protected String normalize(Object object) {
|
||||
// use string representation
|
||||
String name = object.toString();
|
||||
|
@ -1,32 +1,22 @@
|
||||
package net.filebot.similarity;
|
||||
|
||||
import static java.util.stream.Collectors.*;
|
||||
import static net.filebot.util.RegularExpressions.*;
|
||||
import static net.filebot.util.StringUtilities.*;
|
||||
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
import java.util.List;
|
||||
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
|
||||
import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
|
||||
import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
|
||||
import org.simmetrics.StringMetric;
|
||||
import org.simmetrics.metrics.BlockDistance;
|
||||
import org.simmetrics.tokenizers.AbstractTokenizer;
|
||||
|
||||
public class NumericSimilarityMetric implements SimilarityMetric {
|
||||
|
||||
private final AbstractStringMetric metric;
|
||||
|
||||
public NumericSimilarityMetric() {
|
||||
// I don't exactly know why, but I get a good matching behavior
|
||||
// when using QGramsDistance or BlockDistance
|
||||
metric = new QGramsDistance(new NumberTokeniser());
|
||||
}
|
||||
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(new NumberTokeniser()).build();
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
return metric.getSimilarity(normalize(o1), normalize(o2));
|
||||
return metric.compare(normalize(o1), normalize(o2));
|
||||
}
|
||||
|
||||
protected String normalize(Object object) {
|
||||
@ -34,40 +24,12 @@ public class NumericSimilarityMetric implements SimilarityMetric {
|
||||
return object.toString();
|
||||
}
|
||||
|
||||
private static class NumberTokeniser implements InterfaceTokeniser {
|
||||
private static class NumberTokeniser extends AbstractTokenizer {
|
||||
|
||||
@Override
|
||||
public ArrayList<String> tokenizeToArrayList(String s) {
|
||||
return matchIntegers(s).stream().map(String::valueOf).collect(toCollection(ArrayList::new));
|
||||
public List<String> tokenizeToList(String input) {
|
||||
return matchIntegers(input).stream().map(String::valueOf).collect(toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDelimiters() {
|
||||
return NON_DIGIT.pattern();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<String> tokenizeToSet(String input) {
|
||||
return new LinkedHashSet<String>(tokenizeToArrayList(input));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getShortDescriptionString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
|
||||
|
||||
@Override
|
||||
public InterfaceTermHandler getStopWordHandler() {
|
||||
return stopWordHandler;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
|
||||
this.stopWordHandler = stopWordHandler;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,8 @@ import static java.util.Collections.reverseOrder;
|
||||
import static java.util.Comparator.*;
|
||||
import static java.util.stream.Collectors.*;
|
||||
import static net.filebot.similarity.Normalization.*;
|
||||
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||
import static org.simmetrics.tokenizers.Tokenizers.*;
|
||||
|
||||
import java.util.AbstractMap.SimpleImmutableEntry;
|
||||
import java.util.Collection;
|
||||
@ -16,21 +18,22 @@ import java.util.concurrent.ExecutionException;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import org.simmetrics.StringMetric;
|
||||
import org.simmetrics.metrics.BlockDistance;
|
||||
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
public class LocalSearch<T> {
|
||||
|
||||
private AbstractStringMetric metric = new QGramsDistance();
|
||||
private float resultMinimumSimilarity = 0.5f;
|
||||
private int resultSetSize = 20;
|
||||
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
|
||||
|
||||
private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||
|
||||
private T[] objects;
|
||||
private Set<String>[] fields;
|
||||
private final float resultMinimumSimilarity = 0.5f;
|
||||
private final int resultSetSize = 20;
|
||||
|
||||
private final T[] objects;
|
||||
private final Set<String>[] fields;
|
||||
|
||||
public LocalSearch(T[] data, Function<T, Collection<String>> keywords) {
|
||||
objects = data.clone();
|
||||
@ -45,20 +48,12 @@ public class LocalSearch<T> {
|
||||
Set<String> field = fields[i];
|
||||
|
||||
boolean match = field.stream().anyMatch(it -> it.contains(query));
|
||||
double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0);
|
||||
double similarity = field.stream().mapToDouble(it -> metric.compare(query, it)).max().orElse(0);
|
||||
|
||||
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry<T, Double>(object, similarity) : null;
|
||||
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
|
||||
}
|
||||
|
||||
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
|
||||
this.resultMinimumSimilarity = resultMinimumSimilarity;
|
||||
}
|
||||
|
||||
public void setResultSetSize(int resultSetSize) {
|
||||
this.resultSetSize = resultSetSize;
|
||||
}
|
||||
|
||||
protected Set<String> normalize(Collection<String> values) {
|
||||
return values.stream().map(this::normalize).collect(toSet());
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user