From 5d5902cdfb2b5e606681771b432a1aa13df90959 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Wed, 15 Feb 2012 06:12:09 +0000 Subject: [PATCH] * improved international aspects of detection/matching --- .../filebot/media/MediaDetection.java | 2 +- .../filebot/media/ReleaseInfo.java | 90 +++++++++++++------ .../filebot/ui/rename/FormatDialog.properties | 2 +- 3 files changed, 65 insertions(+), 29 deletions(-) diff --git a/source/net/sourceforge/filebot/media/MediaDetection.java b/source/net/sourceforge/filebot/media/MediaDetection.java index 3b521578..63e3257f 100644 --- a/source/net/sourceforge/filebot/media/MediaDetection.java +++ b/source/net/sourceforge/filebot/media/MediaDetection.java @@ -458,7 +458,7 @@ public class MediaDetection { // use maximum strength collator by default final Collator collator = Collator.getInstance(locale); collator.setDecomposition(Collator.FULL_DECOMPOSITION); - collator.setStrength(Collator.TERTIARY); + collator.setStrength(Collator.PRIMARY); return (Comparator) collator; } diff --git a/source/net/sourceforge/filebot/media/ReleaseInfo.java b/source/net/sourceforge/filebot/media/ReleaseInfo.java index 3492998a..d3d18e33 100644 --- a/source/net/sourceforge/filebot/media/ReleaseInfo.java +++ b/source/net/sourceforge/filebot/media/ReleaseInfo.java @@ -2,6 +2,7 @@ package net.sourceforge.filebot.media; +import static java.util.Arrays.*; import static java.util.ResourceBundle.*; import static java.util.regex.Pattern.*; import static net.sourceforge.filebot.similarity.Normalization.*; @@ -12,11 +13,18 @@ import java.io.FileFilter; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.Charset; +import java.text.Collator; +import java.text.Normalizer; +import java.text.Normalizer.Form; import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Scanner; +import 
java.util.Set; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -44,11 +52,12 @@ public class ReleaseInfo { public Locale getLanguageSuffix(String name) { // match locale identifier and lookup Locale object - String lang = matchLast(getLanguageSuffixPattern(), null, name); + Map languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()); + String lang = matchLast(getLanguageSuffixPattern(languages.keySet()), null, name); if (lang == null) return null; - return getLanguageMap(Locale.ENGLISH, Locale.getDefault()).get(lang); + return languages.get(lang); } @@ -80,12 +89,14 @@ public class ReleaseInfo { public List cleanRelease(Iterable items, boolean strict) throws IOException { - return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(false)); + Set languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); + return clean(items, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages)); } public String cleanRelease(String item, boolean strict) throws IOException { - return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(false)); + Set languages = getLanguageMap(Locale.ENGLISH, Locale.getDefault()).keySet(); + return clean(item, getReleaseGroupPattern(strict), getLanguageSuffixPattern(languages), getVideoSourcePattern(), getVideoFormatPattern(), getResolutionPattern(), getBlacklistPattern(), getLanguageOptionPattern(languages)); } @@ -111,30 +122,15 @@ public class ReleaseInfo { } - public Pattern getLanguageSuffixPattern() { - // .{language}[.srt] - return compile("(?<=\\p{Punct}|\\s)(" + 
join(getLanguageMap(Locale.ENGLISH).keySet(), "|") + ")(?=$)", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ); + public Pattern getLanguageOptionPattern(Collection languages) { + // [en] + return compile("(?<=[-\\[{(])(" + join(quoteAll(languages), "|") + ")(?=\\p{Punct})", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ); } - public Map getLanguageMap(Locale... supportedLanguageName) { - Map languageMap = new TreeMap(String.CASE_INSENSITIVE_ORDER); - - for (String code : Locale.getISOLanguages()) { - Locale locale = new Locale(code); - languageMap.put(locale.getLanguage(), locale); - languageMap.put(locale.getISO3Language(), locale); - - // map display language names for given locales - for (Locale language : supportedLanguageName) { - languageMap.put(locale.getDisplayLanguage(language), locale); - } - } - - // remove illegal tokens - languageMap.remove(""); - - return languageMap; + public Pattern getLanguageSuffixPattern(Collection languages) { + // .en.srt + return compile("(?<=\\p{Punct}|\\s)(" + join(quoteAll(languages), "|") + ")(?=$)", CASE_INSENSITIVE | UNICODE_CASE | CANON_EQ); } @@ -164,9 +160,9 @@ public class ReleaseInfo { } - public synchronized Pattern getBlacklistPattern(boolean strict) throws IOException { + public synchronized Pattern getBlacklistPattern() throws IOException { // pattern matching any release group name enclosed in separators - return compile("(? quoteAll(Collection strings) { + List patterns = new ArrayList(strings.size()); + for (String it : strings) { + patterns.add(Pattern.quote(it)); + } + return patterns; + } + + + private Map getLanguageMap(Locale... 
supportedDisplayLocale) { + // use a primary strength collator (the most lenient setting) so that keys differing only by case or accents are treated as equal + Collator collator = Collator.getInstance(Locale.ROOT); + collator.setDecomposition(Collator.FULL_DECOMPOSITION); + collator.setStrength(Collator.PRIMARY); + + @SuppressWarnings("unchecked") + Comparator order = (Comparator) collator; + + Map languageMap = new TreeMap(order); + Set displayLocales = new HashSet(asList(supportedDisplayLocale)); + + for (String code : Locale.getISOLanguages()) { + Locale locale = new Locale(code); + languageMap.put(locale.getLanguage(), locale); + languageMap.put(locale.getISO3Language(), locale); + + // map display language names for given locales + for (Locale language : displayLocales) { + // make sure language name is properly normalized so accents and whatever don't break the regex pattern syntax + String languageName = Normalizer.normalize(locale.getDisplayLanguage(language), Form.NFKD); + languageMap.put(languageName, locale); + } + } + + // remove illegal tokens + languageMap.remove(""); + return languageMap; + } + } diff --git a/source/net/sourceforge/filebot/ui/rename/FormatDialog.properties b/source/net/sourceforge/filebot/ui/rename/FormatDialog.properties index db13fdfb..5051d6ee 100644 --- a/source/net/sourceforge/filebot/ui/rename/FormatDialog.properties +++ b/source/net/sourceforge/filebot/ui/rename/FormatDialog.properties @@ -14,7 +14,7 @@ episode.example[2]: {n} [{airdate}] {t} episode.example[3]: {n.space('.').lower()}.{s}{e.pad(2)} # simple name/year -movie.example[0]: {n} ({y}){" CD$pi"} +movie.example[0]: {n} ({y}){" CD$pi"}{".$lang"} # boxee name movie.example[1]: {n.space('.')}.({y}){".part$pi"} # media info name