filebot/source/net/filebot/similarity/NumericSimilarityMetric.java

96 lines
2.3 KiB
Java
Raw Normal View History

2014-04-19 02:30:29 -04:00
package net.filebot.similarity;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.Set;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
public class NumericSimilarityMetric implements SimilarityMetric {
2015-07-25 18:47:19 -04:00
private final AbstractStringMetric metric;
2015-07-25 18:47:19 -04:00
public NumericSimilarityMetric() {
2015-07-25 18:47:19 -04:00
// I don't exactly know why, but I get a good matching behavior
// when using QGramsDistance or BlockDistance
metric = new QGramsDistance(new NumberTokeniser());
}
2015-07-25 18:47:19 -04:00
@Override
public float getSimilarity(Object o1, Object o2) {
return metric.getSimilarity(normalize(o1), normalize(o2));
}
2015-07-25 18:47:19 -04:00
protected String normalize(Object object) {
// no need to do anything special here, because we don't care about anything but number patterns anyway
return object.toString();
}
2015-07-25 18:47:19 -04:00
private static class NumberTokeniser implements InterfaceTokeniser {
2015-07-25 18:47:19 -04:00
private final String delimiter = "\\D+";
2015-07-25 18:47:19 -04:00
@Override
public ArrayList<String> tokenizeToArrayList(String input) {
ArrayList<String> tokens = new ArrayList<String>();
2015-07-25 18:47:19 -04:00
// scan for number patterns, use non-number pattern as delimiter
Scanner scanner = new Scanner(input).useDelimiter(delimiter);
2015-07-25 18:47:19 -04:00
while (scanner.hasNextInt()) {
// remove leading zeros from number tokens by scanning for Integers
tokens.add(String.valueOf(scanner.nextInt()));
}
2015-07-25 18:47:19 -04:00
return tokens;
}
2015-07-25 18:47:19 -04:00
@Override
public Set<String> tokenizeToSet(String input) {
return new LinkedHashSet<String>(tokenizeToArrayList(input));
}
2015-07-25 18:47:19 -04:00
@Override
public String getShortDescriptionString() {
return getClass().getSimpleName();
}
2015-07-25 18:47:19 -04:00
@Override
public String getDelimiters() {
return delimiter;
}
2015-07-25 18:47:19 -04:00
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
2015-07-25 18:47:19 -04:00
@Override
public InterfaceTermHandler getStopWordHandler() {
return stopWordHandler;
}
2015-07-25 18:47:19 -04:00
@Override
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
this.stopWordHandler = stopWordHandler;
}
2015-07-25 18:47:19 -04:00
}
2015-07-25 18:47:19 -04:00
}