* fine-tune generic numeric sequence matching (e.g. Bones Staffel 1 Folge 5)

This commit is contained in:
Reinhard Pointner 2013-02-15 09:50:23 +00:00
parent 1e06994a59
commit 24f9b8d92a
8 changed files with 75 additions and 18 deletions

View File

@ -8,11 +8,15 @@ import static java.util.Collections.*;
import static net.sourceforge.filebot.Settings.*;
import static net.sourceforge.filebot.similarity.Normalization.*;
import static net.sourceforge.tuned.FileUtilities.*;
import static net.sourceforge.tuned.StringUtilities.*;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import net.sourceforge.filebot.similarity.SeasonEpisodeMatcher.SxE;
import net.sourceforge.filebot.vfs.FileInfo;
@ -187,7 +191,7 @@ public enum EpisodeMetrics implements SimilarityMetric {
}),
// Match via common word sequence in episode name and file name
SubstringSequence(new SequenceMatchSimilarity() {
NameSubstringSequence(new SequenceMatchSimilarity() {
@Override
public float getSimilarity(Object o1, Object o2) {
@ -199,12 +203,19 @@ public enum EpisodeMetrics implements SimilarityMetric {
/**
 * Reduces the given object to the text that is used for sequence matching.
 * <p>
 * An {@code Episode} contributes its series name passed through
 * {@code removeTrailingBrackets}, a {@code Movie} contributes its name, and a
 * {@code File} contributes the extension-less path of its last (up to) three
 * path elements. Any other object is handed on unchanged.
 *
 * @param object the value or candidate being compared
 * @return the result of {@code normalizeObject} for the selected text
 */
@Override
protected String normalize(Object object) {
	final Object subject;

	if (object instanceof Episode) {
		Episode episode = (Episode) object;
		subject = removeTrailingBrackets(episode.getSeriesName());
	} else if (object instanceof Movie) {
		Movie movie = (Movie) object;
		subject = movie.getName();
	} else if (object instanceof File) {
		// include parent folder names, since they may carry the series name
		File pathTail = getRelativePathTail((File) object, 3);
		subject = getNameWithoutExtension(pathTail.getPath());
	} else {
		subject = object;
	}

	// simplify file name, if possible
	return normalizeObject(subject);
}
}),
// Match by generic name similarity
// Match by generic name similarity (round rank)
Name(new NameSimilarityMetric() {
@Override
@ -222,12 +233,33 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
}),
NumericSequence(new SequenceMatchSimilarity() {
// Match by generic name similarity (absolute)
AbsoluteName(new NameSimilarityMetric() {
@Override
protected String normalize(Object object) {
// simplify file name, if possible
return normalizeObject(object).replaceAll("\\D+", " ").trim();
return normalizeObject(object);
}
}),
NumericSequence(new SequenceMatchSimilarity() {
/**
 * Reduces the given object to a space-separated sequence of the integers it contains.
 * <p>
 * An {@code Episode} is first replaced by its {@code EpisodeFormat.SeasonEpisode.formatSxE}
 * representation and a {@code Movie} by its year; any other object is used as is.
 * The result of {@code normalizeObject} is then split on runs of non-digit characters.
 *
 * @param object the value or candidate being compared
 * @return the extracted integers joined by a single space
 */
@Override
protected String normalize(Object object) {
	if (object instanceof Episode) {
		object = EpisodeFormat.SeasonEpisode.formatSxE((Episode) object);
	} else if (object instanceof Movie) {
		object = ((Movie) object).getYear();
	}

	// simplify file name if possible and extract numbers
	List<Integer> numbers = new ArrayList<Integer>(4);
	Scanner scanner = new Scanner(normalizeObject(object)).useDelimiter("\\D+");
	try {
		// NOTE: hasNextInt() is false for a digit run that does not fit into an int,
		// so scanning stops at the first such token
		while (scanner.hasNextInt()) {
			numbers.add(scanner.nextInt());
		}
	} finally {
		// always release the scanner, even if token parsing fails unexpectedly
		scanner.close();
	}

	return join(numbers, " ");
}
}),
@ -411,9 +443,9 @@ public enum EpisodeMetrics implements SimilarityMetric {
// 7 pass: prefer episodes that were aired closer to the last modified date of the file
// 8 pass: resolve remaining collisions via absolute string similarity
if (includeFileMetrics) {
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, new NameSimilarityMetric() };
return new SimilarityMetric[] { FileSize, new MetricCascade(FileName, EpisodeFunnel), EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(NameSubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, AbsoluteName };
} else {
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(SubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, new NameSimilarityMetric() };
return new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringFields, MetaAttributes, new MetricCascade(NameSubstringSequence, Name), Numeric, NumericSequence, Name, TimeStamp, AbsoluteName };
}
}

View File

@ -148,13 +148,14 @@ public class Matcher<V, C> {
for (Match<V, C> possibleMatch : possibleMatches) {
float similarity = metric.getSimilarity(possibleMatch.getValue(), possibleMatch.getCandidate());
Set<Match<V, C>> matchSet = similarityMap.get(similarity);
// DEBUG
// System.out.format("%s: %.04f: %s%n", metric, similarity, possibleMatch);
Set<Match<V, C>> matchSet = similarityMap.get(similarity);
if (matchSet == null) {
matchSet = new LinkedHashSet<Match<V, C>>();
similarityMap.put(similarity, matchSet);
}
matchSet.add(possibleMatch);
// unwind this thread if we have been interrupted

View File

@ -24,6 +24,11 @@ public class SequenceMatchSimilarity implements SimilarityMetric {
if (match == null)
return 0;
return similarity(match, s1, s2);
}
/**
 * Rates a matched sequence relative to the shorter of the two compared strings.
 *
 * @param match the common sequence found in both strings
 * @param s1 the first compared string
 * @param s2 the second compared string
 * @return length of {@code match} divided by the length of the shorter string
 */
protected float similarity(String match, String s1, String s2) {
	// widen to float first so that the division below is not an integer division
	final float matchedLength = match.length();
	final int shorterLength = min(s1.length(), s2.length());

	return matchedLength / shorterLength;
}

View File

@ -47,7 +47,7 @@ public final class SubtitleUtilities {
Map<File, SubtitleDescriptor> subtitleByVideo = new LinkedHashMap<File, SubtitleDescriptor>();
// optimize for generic media <-> subtitle matching
SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, SubstringSequence, new MetricCascade(SubstringSequence, Name), Numeric, new NameSimilarityMetric() };
SimilarityMetric[] metrics = new SimilarityMetric[] { EpisodeFunnel, EpisodeBalancer, NameSubstringSequence, new MetricCascade(NameSubstringSequence, Name), Numeric, new NameSimilarityMetric() };
// subtitle verification metric specifically excluding SxE mismatches
SimilarityMetric absoluteSeasonEpisode = new SimilarityMetric() {
@ -61,7 +61,7 @@ public final class SubtitleUtilities {
return f < 1 ? -1 : 1;
}
};
SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(SubstringSequence, Name));
SimilarityMetric sanity = new MetricCascade(absoluteSeasonEpisode, AirDate, new MetricAvg(NameSubstringSequence, Name));
// first match everything as best as possible, then filter possibly bad matches
Matcher<File, SubtitleDescriptor> matcher = new Matcher<File, SubtitleDescriptor>(files, subtitles, false, metrics);

View File

@ -352,9 +352,15 @@ public final class FileUtilities {
/**
 * Lists the path elements of the given file without limiting how many are returned.
 *
 * @param file the file whose path is listed
 * @return the result of {@code listPathTail} for an unbounded tail size
 */
public static List<File> listPath(File file) {
	// Integer.MAX_VALUE effectively disables the tail size limit
	final int unlimitedTailSize = Integer.MAX_VALUE;
	return listPathTail(file, unlimitedTailSize);
}
public static List<File> listPathTail(File file, int tailSize) {
LinkedList<File> nodes = new LinkedList<File>();
for (File node = file; node != null && !UNC_PREFIX.equals(node.toString()); node = node.getParentFile()) {
File node = file;
for (int i = 0; node != null && i < tailSize && !UNC_PREFIX.equals(node.toString()); i++, node = node.getParentFile()) {
nodes.addFirst(node);
}
@ -362,6 +368,17 @@ public final class FileUtilities {
}
/**
 * Builds a relative path from the names of the trailing path elements of the given file.
 * <p>
 * The elements returned by {@code listPathTail(file, tailSize)} are visited in order;
 * every element that has a parent appends its name to the path built so far.
 * Elements without a parent are ignored.
 *
 * @param file the file whose trailing path elements are used
 * @param tailSize the tail size passed on to {@code listPathTail}
 * @return the assembled relative path, or {@code null} if no visited element has a parent
 */
public static File getRelativePathTail(File file, int tailSize) {
	File tail = null;

	for (File node : listPathTail(file, tailSize)) {
		// an element without a parent contributes nothing to the relative path
		if (node.getParentFile() == null) {
			continue;
		}
		tail = new File(tail, node.getName());
	}

	return tail;
}
public static List<File> listFiles(Iterable<File> folders, int maxDepth, boolean listHiddenFiles) {
List<File> files = new ArrayList<File>();

View File

@ -123,6 +123,7 @@ Hard.Subbed
HBO
hd
HDRip
Hi10P
Hindi
History.Channel
HQ

View File

@ -1,2 +1,3 @@
HIMYM How I Met your Mother
Hml8p Homeland
Hml8p Homeland
NCIS.LA NCIS: Los Angeles

View File

@ -3,8 +3,8 @@ def input = []
def failOnError = _args.conflict == 'fail'
// print input parameters
_args.bindings?.each{ _log.finest("Parameter: $it.key = $it.value") }
args.each{ _log.finest("Argument: $it") }
_args.bindings?.each{ _log.fine("Parameter: $it.key = $it.value") }
args.each{ _log.fine("Argument: $it") }
args.findAll{ !it.exists() }.each{ throw new Exception("File not found: $it") }
// check user-defined pre-condition
@ -34,7 +34,7 @@ def format = [
tvs: tryQuietly{ seriesFormat } ?: '''TV Shows/{n}/{episode.special ? "Special" : "Season "+s}/{n} - {episode.special ? "S00E"+special.pad(2) : s00e00} - {t}{".$lang"}''',
anime: tryQuietly{ animeFormat } ?: '''Anime/{n}/{n} - {sxe} - {t}''',
mov: tryQuietly{ movieFormat } ?: '''Movies/{n} ({y})/{n} ({y}){" CD$pi"}{".$lang"}''',
music: tryQuietly{ musicFormat } ?: '''Music/{n}/{album}/{n} - {t}'''
music: tryQuietly{ musicFormat } ?: '''Music/{n}/{album+'/'}{pi.pad(2)+'. '}{artist} - {t}'''
]
@ -117,7 +117,7 @@ def groups = input.groupBy{ f ->
def tvs = detectSeriesName(f)
def mov = detectMovie(f, false)
println "$f.name [series: $tvs, movie: $mov]"
_log.fine("$f.name [series: $tvs, movie: $mov]")
// DECIDE EPISODE VS MOVIE (IF NOT CLEAR)
if (tvs && mov) {
@ -129,10 +129,10 @@ def groups = input.groupBy{ f ->
// S00E00 | 2012.07.21 | One Piece 217 | Firefly - Serenity | [Taken 1, Taken 2, Taken 3, Taken 4, ..., Taken 10]
if (parseEpisodeNumber(fn, true) || parseDate(fn) || (fn =~ sn && parseEpisodeNumber(fn.after(sn), false)) || fn.after(sn) =~ / - .+/ || f.dir.listFiles{ it.isVideo() && norm(it.name) =~ sn && it.name =~ /\b\d{1,3}\b/}.size() >= 10) {
println "Exclude Movie: $mov"
_log.fine("Exclude Movie: $mov")
mov = null
} else if ((detectMovie(f, true) && [dn, fn].find{ it =~ /(19|20)\d{2}/ }) || [dn, fn].find{ it =~ mn && !(it.after(mn) =~ /\b\d{1,3}\b/) }) {
println "Exclude Series: $tvs"
_log.fine("Exclude Series: $tvs")
tvs = null
}
}