* improved matching by using existing common word sequence name matching

This commit is contained in:
Reinhard Pointner 2012-01-03 09:23:03 +00:00
parent 4d45826540
commit 52b302e3a4
4 changed files with 101 additions and 7 deletions

View File

@ -46,7 +46,7 @@ import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
public class MediaDetection {
private static ReleaseInfo releaseInfo = new ReleaseInfo();
private static final ReleaseInfo releaseInfo = new ReleaseInfo();
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files, Locale locale) throws Exception {

View File

@ -174,6 +174,24 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
}),
// Match via common word sequence in episode name and file name
SubstringSequence(new SequenceMatchSimilarity() {

	/**
	 * Returns the sequence-match similarity of the two objects, rounded down to a multiple of 1/5.
	 */
	@Override
	public float getSimilarity(Object o1, Object o2) {
		// collapse the absolute similarity into coarse ranks (steps of 1/5), so that this pass
		// is less likely to fall for false positives and undecided cases move on to the next pass
		float absolute = super.getSimilarity(o1, o2);
		double rank = floor(absolute * 5);
		return (float) (rank / 5);
	}

	/**
	 * Delegates to the shared normalizeObject helper instead of the default normalization.
	 */
	@Override
	protected String normalize(Object object) {
		// simplify file name, if possible
		return normalizeObject(object);
	}
}),
// Match by generic name similarity
Name(new NameSimilarityMetric() {
@ -273,7 +291,15 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
private static final Map<Object, String> transformCache = synchronizedMap(new WeakHashMap<Object, String>(64, 4));
protected static String normalizeObject(Object object) {
String result = transformCache.get(object);
if (result != null) {
return result;
}
String name = object.toString();
// use name without extension
@ -289,7 +315,11 @@ public enum EpisodeMetrics implements SimilarityMetric {
// remove/normalize special characters
name = normalizePunctuation(name);
return name.toLowerCase();
// normalize to lower case (String is immutable, so the result has to be assigned back;
// calling toLowerCase() on its own would silently leave the name in its original case)
name = name.toLowerCase();
// remember the normalized name so the cache lookup at the top of this method can return it next time
transformCache.put(object, name);
return name;
}
@ -301,9 +331,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
// 6 pass: divide by generic numeric similarity
// 7 pass: resolve remaining collisions via absolute string similarity
if (includeFileMetrics) {
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric };
} else {
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric };
}
}

View File

@ -0,0 +1,58 @@
package net.sourceforge.filebot.similarity;
import static java.lang.Math.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import java.text.Collator;
import java.util.Comparator;
import java.util.Locale;
/**
 * Similarity metric based on the common word sequence shared by the normalized string
 * representations of two objects.
 */
public class SequenceMatchSimilarity implements SimilarityMetric {

	/**
	 * Scores two objects by the length of their common word sequence relative to the length of
	 * the shorter normalized string.
	 *
	 * @param o1 first object, converted via {@link #normalize(Object)}
	 * @param o2 second object, converted via {@link #normalize(Object)}
	 * @return 0 if no common word sequence is found or if either normalized string is empty,
	 *         otherwise the match length divided by the length of the shorter normalized string
	 */
	@Override
	public float getSimilarity(Object o1, Object o2) {
		String s1 = normalize(o1);
		String s2 = normalize(o2);

		// match common word sequence
		String match = match(s1, s2);
		if (match == null)
			return 0;

		// an empty normalized string would make the division below yield NaN (0 / 0),
		// so treat it as "no similarity" instead
		int shortest = min(s1.length(), s2.length());
		if (shortest == 0)
			return 0;

		return (float) match.length() / shortest;
	}

	/**
	 * Converts an object into the string that is used for matching.
	 *
	 * @param object object to normalize; its {@code toString()} value is used
	 * @return string representation with normalized punctuation, trimmed and lower-cased
	 */
	protected String normalize(Object object) {
		// use string representation
		String name = object.toString();

		// normalize separators
		name = normalizePunctuation(name);

		// normalize case and trim (locale-independent, consistent with the ROOT collator used for matching)
		return name.trim().toLowerCase(Locale.ROOT);
	}

	/**
	 * Finds the common word sequence of two already normalized strings.
	 *
	 * @param s1 first normalized string
	 * @param s2 second normalized string
	 * @return the result of {@code SeriesNameMatcher.matchByFirstCommonWordSequence(s1, s2)};
	 *         {@link #getSimilarity(Object, Object)} treats {@code null} as "no match"
	 */
	protected String match(String s1, String s2) {
		// locale-neutral collator with full decomposition and tertiary strength; created per call
		final Collator collator = Collator.getInstance(Locale.ROOT);
		collator.setDecomposition(Collator.FULL_DECOMPOSITION);
		collator.setStrength(Collator.TERTIARY);

		// adapt the collator to Comparator<String> without resorting to a raw-type cast
		Comparator<String> wordComparator = new Comparator<String>() {

			@Override
			public int compare(String a, String b) {
				return collator.compare(a, b);
			}
		};

		// pass 10 as commonWordSequenceMaxStartIndex (the no-arg SeriesNameMatcher uses 3)
		SeriesNameMatcher matcher = new SeriesNameMatcher(wordComparator, 10) {

			@Override
			protected String normalize(String name) {
				return name; // assume normalization has been done, no need to do that here again
			}
		};

		return matcher.matchByFirstCommonWordSequence(s1, s2);
	}
}

View File

@ -33,17 +33,23 @@ public class SeriesNameMatcher {
protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(new SeasonEpisodeFilter(30, 50, -1), true);
protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
protected int commonWordSequenceMaxStartIndex = 3;
protected int commonWordSequenceMaxStartIndex;
protected Comparator<String> commonWordComparator;
/**
 * Creates a matcher configured with {@link String#CASE_INSENSITIVE_ORDER} as its common-word comparator.
 */
// NOTE(review): the two delegation lines below look like the before/after pair of a diff;
// presumably only the second one (with the explicit start index 3) belongs to the committed code — confirm
public SeriesNameMatcher() {
this(String.CASE_INSENSITIVE_ORDER);
this(String.CASE_INSENSITIVE_ORDER, 3);
}
/**
 * Creates a matcher configured with the given common-word comparator.
 *
 * @param comparator comparator stored as the common-word comparator
 */
// NOTE(review): the two statements below look like the before/after pair of a diff;
// presumably only the delegation with the explicit start index 3 belongs to the committed code — confirm
public SeriesNameMatcher(Comparator<String> comparator) {
this.commonWordComparator = comparator;
this(comparator, 3);
}
/**
 * Creates a matcher with an explicit common-word comparator and maximum start index.
 *
 * @param commonWordComparator comparator stored in the field of the same name
 * @param commonWordSequenceMaxStartIndex value stored in the field of the same name
 */
public SeriesNameMatcher(Comparator<String> commonWordComparator, int commonWordSequenceMaxStartIndex) {
	// plain field initialization; the two assignments are independent of each other
	this.commonWordComparator = commonWordComparator;
	this.commonWordSequenceMaxStartIndex = commonWordSequenceMaxStartIndex;
}