* reuse name normalization code

This commit is contained in:
Reinhard Pointner 2012-01-02 03:34:13 +00:00
parent b8c96b8fbe
commit 6707a94518
6 changed files with 61 additions and 35 deletions

View File

@ -22,7 +22,7 @@ public final class VerificationUtilities {
*/
public static final Pattern EMBEDDED_CHECKSUM = Pattern.compile("(?<=\\[|\\()(\\p{XDigit}{8})(?=\\]|\\))");
public static String getEmbeddedChecksum(CharSequence string) {
Matcher matcher = EMBEDDED_CHECKSUM.matcher(string);
String embeddedChecksum = null;
@ -35,18 +35,12 @@ public final class VerificationUtilities {
return embeddedChecksum;
}
/**
 * Strips every embedded checksum from the given string.
 * A checksum is exactly 8 hex digits preceded by '(' or '[' and followed by ']' or ')';
 * the surrounding brackets are removed together with the digits.
 *
 * @param string text that may contain embedded checksums
 * @return the text with all bracketed 8-digit hex checksums removed
 */
public static String removeEmbeddedChecksum(String string) {
    // opening bracket, 8 hex digits, closing bracket
    Pattern bracketedChecksum = Pattern.compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");
    return bracketedChecksum.matcher(string).replaceAll("");
}
/**
 * Looks up the hash for the given file, delegating to the recursive overload.
 * The lookup starts at depth 0 in the folder that contains the file.
 *
 * @param file the file whose hash is wanted
 * @param type the hash type to look for
 * @param maxDepth the depth limit passed through to the recursive lookup
 * @return whatever the recursive lookup returns
 * @throws IOException if the recursive lookup fails with an I/O error
 */
public static String getHashFromVerificationFile(File file, HashType type, int maxDepth) throws IOException {
    File startFolder = file.getParentFile();
    return getHashFromVerificationFile(startFolder, file, type, 0, maxDepth);
}
private static String getHashFromVerificationFile(File folder, File target, HashType type, int depth, int maxDepth) throws IOException {
// stop if we reached max depth or the file system root
if (folder == null || depth > maxDepth)
@ -75,7 +69,7 @@ public final class VerificationUtilities {
return getHashFromVerificationFile(folder.getParentFile(), target, type, depth + 1, maxDepth);
}
public static HashType getHashType(File verificationFile) {
for (HashType hashType : HashType.values()) {
if (hashType.getFilter().accept(verificationFile))
@ -85,7 +79,7 @@ public final class VerificationUtilities {
return null;
}
public static HashType getHashTypeByExtension(String extension) {
for (HashType hashType : HashType.values()) {
if (hashType.getFilter().acceptExtension(extension))
@ -95,7 +89,7 @@ public final class VerificationUtilities {
return null;
}
public static String computeHash(File file, HashType type) throws IOException, InterruptedException {
Hash hash = type.newHash();
@ -120,7 +114,7 @@ public final class VerificationUtilities {
return hash.digest();
}
/**
* Dummy constructor to prevent instantiation.
*/

View File

@ -5,7 +5,7 @@ package net.sourceforge.filebot.similarity;
import static java.lang.Math.*;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.sourceforge.filebot.hash.VerificationUtilities.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import java.io.File;
@ -287,10 +287,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
name = removeEmbeddedChecksum(name);
// remove/normalize special characters
name = name.replaceAll("['`´]+", "");
name = name.replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = normalizePunctuation(name);
return name.trim().toLowerCase();
return name.toLowerCase();
}

View File

@ -2,6 +2,7 @@
package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.similarity.Normalization.*;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserQGram3;
@ -11,28 +12,28 @@ public class NameSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
/**
 * Creates the metric, backed by a QGramsDistance that uses a TokeniserQGram3 tokenizer.
 */
public NameSimilarityMetric() {
    // QGramsDistance with a QGram tokenizer seems to work best for similarity of names
    TokeniserQGram3 tokenizer = new TokeniserQGram3();
    this.metric = new QGramsDistance(tokenizer);
}
/**
 * Compares the normalized forms of both objects with the underlying string metric.
 *
 * @param o1 first object; normalized via {@link #normalize(Object)}
 * @param o2 second object; normalized via {@link #normalize(Object)}
 * @return the similarity reported by the underlying string metric
 */
@Override
public float getSimilarity(Object o1, Object o2) {
    String first = normalize(o1);
    String second = normalize(o2);
    return metric.getSimilarity(first, second);
}
/**
 * Produces the comparison key for an object: its string representation with
 * punctuation normalized and converted to lower case.
 *
 * @param object the object to normalize; must not be null
 * @return the normalized, lower-cased string representation
 */
protected String normalize(Object object) {
    // use string representation
    String name = object.toString();

    // normalize separators (shared helper; its result is already trimmed)
    name = normalizePunctuation(name);

    // normalize case
    return name.toLowerCase();
}
}

View File

@ -0,0 +1,31 @@
package net.sourceforge.filebot.similarity;
/**
 * Static helpers for normalizing names before they are compared:
 * punctuation clean-up, removal of bracketed groups and removal of embedded checksums.
 */
public class Normalization {

    // String.replaceAll recompiles its regex on every call, so the patterns are compiled once here.
    // The acute accent (U+00B4) is written as a Unicode escape so the source compiles identically
    // regardless of the encoding passed to the compiler.
    private static final java.util.regex.Pattern APOSTROPHES = java.util.regex.Pattern.compile("['`\u00B4]+");
    private static final java.util.regex.Pattern SEPARATORS = java.util.regex.Pattern.compile("[\\p{Punct}\\p{Space}]+");
    private static final java.util.regex.Pattern ROUND_BRACKETS = java.util.regex.Pattern.compile("\\([^\\(]*\\)");
    private static final java.util.regex.Pattern SQUARE_BRACKETS = java.util.regex.Pattern.compile("\\[[^\\[]*\\]");
    private static final java.util.regex.Pattern CURLY_BRACKETS = java.util.regex.Pattern.compile("\\{[^\\{]*\\}");
    private static final java.util.regex.Pattern EMBEDDED_CHECKSUM = java.util.regex.Pattern.compile("[\\(\\[]\\p{XDigit}{8}[\\]\\)]");


    /**
     * Removes apostrophe-like characters (', ` and the acute accent), collapses every run of
     * punctuation and whitespace into a single space, and trims the result.
     *
     * @param name the name to normalize; must not be null
     * @return the trimmed name with apostrophes removed and separators collapsed to single spaces
     */
    public static String normalizePunctuation(String name) {
        // remove/normalize special characters
        String withoutApostrophes = APOSTROPHES.matcher(name).replaceAll("");
        String collapsed = SEPARATORS.matcher(withoutApostrophes).replaceAll(" ");

        return collapsed.trim();
    }


    /**
     * Replaces each (...), [...] and {...} section with a single space, e.g. group names
     * and checksums. The result is not trimmed.
     * <p>
     * Note: the content of a section may contain anything except another opening bracket of
     * the same kind, so a match extends to the last matching closing bracket that precedes
     * the next opening bracket.
     *
     * @param name the name to process; must not be null
     * @return the name with bracketed sections replaced by spaces
     */
    public static String normalizeBrackets(String name) {
        // remove group names and checksums, any [...] or (...)
        String result = ROUND_BRACKETS.matcher(name).replaceAll(" ");
        result = SQUARE_BRACKETS.matcher(result).replaceAll(" ");
        result = CURLY_BRACKETS.matcher(result).replaceAll(" ");

        return result;
    }


    /**
     * Removes every embedded checksum: exactly 8 hex digits preceded by '(' or '[' and
     * followed by ']' or ')'. The brackets are removed together with the digits.
     *
     * @param string the text to process; must not be null
     * @return the text without embedded checksums
     */
    public static String removeEmbeddedChecksum(String string) {
        // match embedded checksum and surrounding brackets
        return EMBEDDED_CHECKSUM.matcher(string).replaceAll("");
    }

}

View File

@ -2,6 +2,9 @@
package net.sourceforge.filebot.similarity;
import static net.sourceforge.filebot.similarity.Normalization.*;
public class SubstringMetric implements SimilarityMetric {
@Override
@ -17,13 +20,13 @@ public class SubstringMetric implements SimilarityMetric {
return s1.contains(s2) || s2.contains(s1) ? 1 : 0;
}
protected String normalize(Object object) {
// use string representation
String name = object.toString();
// normalize separators
name = name.replaceAll("['`´]+", "").replaceAll("[\\p{Punct}\\p{Space}]+", " ");
name = normalizePunctuation(name);
// normalize case and trim
return name.trim().toLowerCase();

View File

@ -3,16 +3,17 @@ package net.sourceforge.filebot.web;
import static java.util.Collections.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import java.util.AbstractList;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.AbstractMap.SimpleEntry;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
@ -32,7 +33,7 @@ class LocalSearch<T> {
private final List<T> objects;
private final List<Set<String>> fields;
public LocalSearch(Collection<? extends T> data) {
objects = new ArrayList<T>(data);
fields = new ArrayList<Set<String>>(objects.size());
@ -42,7 +43,7 @@ class LocalSearch<T> {
}
}
public List<T> search(String query) throws ExecutionException, InterruptedException {
final String q = normalize(query);
List<Callable<Entry<T, Float>>> tasks = new ArrayList<Callable<Entry<T, Float>>>(objects.size());
@ -96,7 +97,7 @@ class LocalSearch<T> {
return resultSet.get(index).getKey();
}
@Override
public int size() {
return Math.min(resultSetSize, resultSet.size());
@ -104,12 +105,12 @@ class LocalSearch<T> {
};
}
/**
 * Returns the searchable fields of an object. This implementation uses a single field,
 * the object's string representation, passed through {@code set(String...)}.
 *
 * @param object the object to index; must not be null
 * @return the set built from the object's string representation
 */
protected Set<String> getFields(T object) {
    String text = object.toString();
    return set(text);
}
protected Set<String> set(String... values) {
Set<String> set = new HashSet<String>(values.length);
for (String value : values) {
@ -120,13 +121,10 @@ class LocalSearch<T> {
return set;
}
/**
 * Produces the comparison key for a value: punctuation normalized via the shared
 * helper, then converted to lower case.
 *
 * @param value the value to normalize; must not be null
 * @return the normalized, lower-cased value
 */
protected String normalize(String value) {
    // normalize separator, normalize case and trim (the helper already trims its result)
    return normalizePunctuation(value).toLowerCase();
}
}