mirror of
https://github.com/mitb-archive/filebot
synced 2024-12-23 16:28:51 -05:00
* improved matching by using existing common word sequence name matching
This commit is contained in:
parent
4d45826540
commit
52b302e3a4
@ -46,7 +46,7 @@ import net.sourceforge.filebot.web.TheTVDBClient.TheTVDBSearchResult;
|
||||
|
||||
public class MediaDetection {
|
||||
|
||||
private static ReleaseInfo releaseInfo = new ReleaseInfo();
|
||||
private static final ReleaseInfo releaseInfo = new ReleaseInfo();
|
||||
|
||||
|
||||
public static Map<Set<File>, Set<String>> mapSeriesNamesByFiles(Collection<File> files, Locale locale) throws Exception {
|
||||
|
@ -174,6 +174,24 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
||||
}
|
||||
}),
|
||||
|
||||
// Match via common word sequence in episode name and file name
|
||||
SubstringSequence(new SequenceMatchSimilarity() {

	/**
	 * Returns the inherited sequence-match similarity rounded down to the nearest fifth.
	 */
	@Override
	public float getSimilarity(Object o1, Object o2) {
		// absolute similarity as computed by the parent metric
		float absolute = super.getSimilarity(o1, o2);

		// collapse the absolute value into coarse ranks, so that this pass is less likely
		// to fall for false positives and undecided cases move on to the next pass
		double rank = floor(absolute * 5);
		return (float) (rank / 5);
	}


	/**
	 * Delegates to the enum's shared normalizeObject helper instead of the parent's normalization.
	 */
	@Override
	protected String normalize(Object object) {
		// simplify file name, if possible
		return normalizeObject(object);
	}
}),
|
||||
|
||||
// Match by generic name similarity
|
||||
Name(new NameSimilarityMetric() {
|
||||
|
||||
@ -273,7 +291,15 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
||||
}
|
||||
|
||||
|
||||
private static final Map<Object, String> transformCache = synchronizedMap(new WeakHashMap<Object, String>(64, 4));
|
||||
|
||||
|
||||
protected static String normalizeObject(Object object) {
|
||||
String result = transformCache.get(object);
|
||||
if (result != null) {
|
||||
return result;
|
||||
}
|
||||
|
||||
String name = object.toString();
|
||||
|
||||
// use name without extension
|
||||
@ -289,7 +315,11 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
||||
// remove/normalize special characters
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
return name.toLowerCase();
|
||||
// normalize to lower case
|
||||
name.toLowerCase();
|
||||
|
||||
transformCache.put(object, name);
|
||||
return name;
|
||||
}
|
||||
|
||||
|
||||
@ -301,9 +331,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
|
||||
// 6 pass: divide by generic numeric similarity
|
||||
// 7 pass: resolve remaining collisions via absolute string similarity
|
||||
if (includeFileMetrics) {
|
||||
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
|
||||
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric };
|
||||
} else {
|
||||
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, Name, Numeric, new NameSimilarityMetric() };
|
||||
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, new MetricCascade(SubstringSequence, Name), Numeric };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,58 @@
|
||||
|
||||
package net.sourceforge.filebot.similarity;
|
||||
|
||||
|
||||
import static java.lang.Math.*;
|
||||
import static net.sourceforge.filebot.similarity.Normalization.*;
|
||||
|
||||
import java.text.Collator;
|
||||
import java.util.Comparator;
|
||||
import java.util.Locale;
|
||||
|
||||
|
||||
public class SequenceMatchSimilarity implements SimilarityMetric {
|
||||
|
||||
@Override
|
||||
public float getSimilarity(Object o1, Object o2) {
|
||||
String s1 = normalize(o1);
|
||||
String s2 = normalize(o2);
|
||||
|
||||
// match common word sequence
|
||||
String match = match(s1, s2);
|
||||
if (match == null)
|
||||
return 0;
|
||||
|
||||
return (float) match.length() / min(s1.length(), s2.length());
|
||||
}
|
||||
|
||||
|
||||
protected String normalize(Object object) {
|
||||
// use string representation
|
||||
String name = object.toString();
|
||||
|
||||
// normalize separators
|
||||
name = normalizePunctuation(name);
|
||||
|
||||
// normalize case and trim
|
||||
return name.trim().toLowerCase();
|
||||
}
|
||||
|
||||
|
||||
protected String match(String s1, String s2) {
|
||||
// use maximum strength collator by default
|
||||
Collator collator = Collator.getInstance(Locale.ROOT);
|
||||
collator.setDecomposition(Collator.FULL_DECOMPOSITION);
|
||||
collator.setStrength(Collator.TERTIARY);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
SeriesNameMatcher matcher = new SeriesNameMatcher((Comparator) collator, 10) {
|
||||
|
||||
@Override
|
||||
protected String normalize(String name) {
|
||||
return name; // assume normalization has been done, no need to do that here again
|
||||
};
|
||||
};
|
||||
|
||||
return matcher.matchByFirstCommonWordSequence(s1, s2);
|
||||
}
|
||||
}
|
@ -33,17 +33,23 @@ public class SeriesNameMatcher {
|
||||
protected SeasonEpisodeMatcher seasonEpisodeMatcher = new SeasonEpisodeMatcher(new SeasonEpisodeFilter(30, 50, -1), true);
|
||||
protected NameSimilarityMetric nameSimilarityMetric = new NameSimilarityMetric();
|
||||
|
||||
protected int commonWordSequenceMaxStartIndex = 3;
|
||||
protected int commonWordSequenceMaxStartIndex;
|
||||
protected Comparator<String> commonWordComparator;
|
||||
|
||||
|
||||
public SeriesNameMatcher() {
|
||||
this(String.CASE_INSENSITIVE_ORDER);
|
||||
this(String.CASE_INSENSITIVE_ORDER, 3);
|
||||
}
|
||||
|
||||
|
||||
public SeriesNameMatcher(Comparator<String> comparator) {
|
||||
this.commonWordComparator = comparator;
|
||||
this(comparator, 3);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a matcher with an explicit word comparator and start index limit.
 *
 * @param commonWordComparator comparator stored for comparing words of common word sequences
 * @param commonWordSequenceMaxStartIndex stored as the start index limit; presumably bounds how
 *        far into a name a common word sequence may begin (TODO confirm against the matching code)
 */
public SeriesNameMatcher(Comparator<String> commonWordComparator, int commonWordSequenceMaxStartIndex) {
	this.commonWordComparator = commonWordComparator;
	this.commonWordSequenceMaxStartIndex = commonWordSequenceMaxStartIndex;
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user