mirror of
https://github.com/mitb-archive/filebot
synced 2025-01-08 12:28:04 -05:00
Refactor string similarity code
This commit is contained in:
parent
dfa4f78448
commit
9a228e6927
1
ivy.xml
1
ivy.xml
@ -7,6 +7,7 @@
|
|||||||
<dependency rev="4.5.2" org="net.java.dev.jna" name="jna-platform" />
|
<dependency rev="4.5.2" org="net.java.dev.jna" name="jna-platform" />
|
||||||
<dependency rev="2.33" org="args4j" name="args4j" />
|
<dependency rev="2.33" org="args4j" name="args4j" />
|
||||||
<dependency rev="2.10.5" org="net.sf.ehcache" name="ehcache" />
|
<dependency rev="2.10.5" org="net.sf.ehcache" name="ehcache" />
|
||||||
|
<dependency rev="4.1.1" org="com.github.mpkorstanje" name="simmetrics-core" />
|
||||||
<dependency rev="61.1" org="com.ibm.icu" name="icu4j" />
|
<dependency rev="61.1" org="com.ibm.icu" name="icu4j" />
|
||||||
<dependency rev="1.11.3" org="org.jsoup" name="jsoup" />
|
<dependency rev="1.11.3" org="org.jsoup" name="jsoup" />
|
||||||
<dependency rev="1.8" org="org.tukaani" name="xz" />
|
<dependency rev="1.8" org="org.tukaani" name="xz" />
|
||||||
|
@ -15,6 +15,7 @@ ivy/jar/jaxb-impl.jar
|
|||||||
ivy/jar/junrar.jar
|
ivy/jar/junrar.jar
|
||||||
ivy/jar/jna.jar
|
ivy/jar/jna.jar
|
||||||
ivy/jar/jna-platform.jar
|
ivy/jar/jna-platform.jar
|
||||||
|
ivy/jar/simmetrics-core.jar
|
||||||
ivy/jar/streamex.jar
|
ivy/jar/streamex.jar
|
||||||
ivy/jar/icu4j.jar
|
ivy/jar/icu4j.jar
|
||||||
ivy/jar/language-detector.jar
|
ivy/jar/language-detector.jar
|
||||||
@ -53,5 +54,4 @@ ivy/bundle/json-io.jar
|
|||||||
ivy/bundle/guava.jar
|
ivy/bundle/guava.jar
|
||||||
jars/xmlrpc.jar
|
jars/xmlrpc.jar
|
||||||
jars/ObjCBridge.jar
|
jars/ObjCBridge.jar
|
||||||
jars/simmetrics.jar
|
|
||||||
jars/jacksum.jar
|
jars/jacksum.jar
|
Binary file not shown.
@ -1,40 +1,26 @@
|
|||||||
|
|
||||||
package net.filebot.similarity;
|
package net.filebot.similarity;
|
||||||
|
|
||||||
|
|
||||||
import static net.filebot.similarity.Normalization.*;
|
import static net.filebot.similarity.Normalization.*;
|
||||||
|
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||||
|
import static org.simmetrics.tokenizers.Tokenizers.*;
|
||||||
|
|
||||||
|
import org.simmetrics.StringMetric;
|
||||||
|
import org.simmetrics.metrics.BlockDistance;
|
||||||
|
|
||||||
import com.ibm.icu.text.Transliterator;
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
|
||||||
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
|
|
||||||
|
|
||||||
|
|
||||||
public class NameSimilarityMetric implements SimilarityMetric {
|
public class NameSimilarityMetric implements SimilarityMetric {
|
||||||
|
|
||||||
private final AbstractStringMetric metric;
|
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
|
||||||
private final Transliterator transliterator;
|
|
||||||
|
|
||||||
|
|
||||||
public NameSimilarityMetric() {
|
|
||||||
// QGramsDistance with a QGram tokenizer seems to work best for similarity of names
|
|
||||||
this(new QGramsDistance(new TokeniserQGram3()), Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public NameSimilarityMetric(AbstractStringMetric metric, Transliterator transliterator) {
|
|
||||||
this.metric = metric;
|
|
||||||
this.transliterator = transliterator;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float getSimilarity(Object o1, Object o2) {
|
public float getSimilarity(Object o1, Object o2) {
|
||||||
return metric.getSimilarity(normalize(o1), normalize(o2));
|
return metric.compare(normalize(o1), normalize(o2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected String normalize(Object object) {
|
protected String normalize(Object object) {
|
||||||
// use string representation
|
// use string representation
|
||||||
String name = object.toString();
|
String name = object.toString();
|
||||||
|
@ -1,32 +1,22 @@
|
|||||||
package net.filebot.similarity;
|
package net.filebot.similarity;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.*;
|
import static java.util.stream.Collectors.*;
|
||||||
import static net.filebot.util.RegularExpressions.*;
|
|
||||||
import static net.filebot.util.StringUtilities.*;
|
import static net.filebot.util.StringUtilities.*;
|
||||||
|
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.List;
|
||||||
import java.util.LinkedHashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
import org.simmetrics.StringMetric;
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
import org.simmetrics.metrics.BlockDistance;
|
||||||
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
|
import org.simmetrics.tokenizers.AbstractTokenizer;
|
||||||
import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
|
|
||||||
import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
|
|
||||||
|
|
||||||
public class NumericSimilarityMetric implements SimilarityMetric {
|
public class NumericSimilarityMetric implements SimilarityMetric {
|
||||||
|
|
||||||
private final AbstractStringMetric metric;
|
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(new NumberTokeniser()).build();
|
||||||
|
|
||||||
public NumericSimilarityMetric() {
|
|
||||||
// I don't exactly know why, but I get a good matching behavior
|
|
||||||
// when using QGramsDistance or BlockDistance
|
|
||||||
metric = new QGramsDistance(new NumberTokeniser());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float getSimilarity(Object o1, Object o2) {
|
public float getSimilarity(Object o1, Object o2) {
|
||||||
return metric.getSimilarity(normalize(o1), normalize(o2));
|
return metric.compare(normalize(o1), normalize(o2));
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String normalize(Object object) {
|
protected String normalize(Object object) {
|
||||||
@ -34,40 +24,12 @@ public class NumericSimilarityMetric implements SimilarityMetric {
|
|||||||
return object.toString();
|
return object.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class NumberTokeniser implements InterfaceTokeniser {
|
private static class NumberTokeniser extends AbstractTokenizer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ArrayList<String> tokenizeToArrayList(String s) {
|
public List<String> tokenizeToList(String input) {
|
||||||
return matchIntegers(s).stream().map(String::valueOf).collect(toCollection(ArrayList::new));
|
return matchIntegers(input).stream().map(String::valueOf).collect(toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getDelimiters() {
|
|
||||||
return NON_DIGIT.pattern();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Set<String> tokenizeToSet(String input) {
|
|
||||||
return new LinkedHashSet<String>(tokenizeToArrayList(input));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String getShortDescriptionString() {
|
|
||||||
return getClass().getSimpleName();
|
|
||||||
}
|
|
||||||
|
|
||||||
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public InterfaceTermHandler getStopWordHandler() {
|
|
||||||
return stopWordHandler;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
|
|
||||||
this.stopWordHandler = stopWordHandler;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,8 @@ import static java.util.Collections.reverseOrder;
|
|||||||
import static java.util.Comparator.*;
|
import static java.util.Comparator.*;
|
||||||
import static java.util.stream.Collectors.*;
|
import static java.util.stream.Collectors.*;
|
||||||
import static net.filebot.similarity.Normalization.*;
|
import static net.filebot.similarity.Normalization.*;
|
||||||
|
import static org.simmetrics.builders.StringMetricBuilder.*;
|
||||||
|
import static org.simmetrics.tokenizers.Tokenizers.*;
|
||||||
|
|
||||||
import java.util.AbstractMap.SimpleImmutableEntry;
|
import java.util.AbstractMap.SimpleImmutableEntry;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
@ -16,21 +18,22 @@ import java.util.concurrent.ExecutionException;
|
|||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
import com.ibm.icu.text.Transliterator;
|
import org.simmetrics.StringMetric;
|
||||||
|
import org.simmetrics.metrics.BlockDistance;
|
||||||
|
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
import com.ibm.icu.text.Transliterator;
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
|
||||||
|
|
||||||
public class LocalSearch<T> {
|
public class LocalSearch<T> {
|
||||||
|
|
||||||
private AbstractStringMetric metric = new QGramsDistance();
|
private final StringMetric metric = with(new BlockDistance<String>()).tokenize(qGramWithPadding(3)).build();
|
||||||
private float resultMinimumSimilarity = 0.5f;
|
|
||||||
private int resultSetSize = 20;
|
|
||||||
|
|
||||||
private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||||
|
|
||||||
private T[] objects;
|
private final float resultMinimumSimilarity = 0.5f;
|
||||||
private Set<String>[] fields;
|
private final int resultSetSize = 20;
|
||||||
|
|
||||||
|
private final T[] objects;
|
||||||
|
private final Set<String>[] fields;
|
||||||
|
|
||||||
public LocalSearch(T[] data, Function<T, Collection<String>> keywords) {
|
public LocalSearch(T[] data, Function<T, Collection<String>> keywords) {
|
||||||
objects = data.clone();
|
objects = data.clone();
|
||||||
@ -45,20 +48,12 @@ public class LocalSearch<T> {
|
|||||||
Set<String> field = fields[i];
|
Set<String> field = fields[i];
|
||||||
|
|
||||||
boolean match = field.stream().anyMatch(it -> it.contains(query));
|
boolean match = field.stream().anyMatch(it -> it.contains(query));
|
||||||
double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0);
|
double similarity = field.stream().mapToDouble(it -> metric.compare(query, it)).max().orElse(0);
|
||||||
|
|
||||||
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry<T, Double>(object, similarity) : null;
|
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry<T, Double>(object, similarity) : null;
|
||||||
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
|
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
|
|
||||||
this.resultMinimumSimilarity = resultMinimumSimilarity;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setResultSetSize(int resultSetSize) {
|
|
||||||
this.resultSetSize = resultSetSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Set<String> normalize(Collection<String> values) {
|
protected Set<String> normalize(Collection<String> values) {
|
||||||
return values.stream().map(this::normalize).collect(toSet());
|
return values.stream().map(this::normalize).collect(toSet());
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user