filebot/source/net/filebot/similarity/Normalization.java

127 lines
3.9 KiB
Java
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package net.filebot.similarity;
import static java.util.regex.Pattern.*;
import static net.filebot.util.RegularExpressions.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers for normalizing names: unifying quotation marks, stripping or replacing
 * punctuation, removing bracketed groups and embedded checksums, and truncating text at word
 * boundaries.
 *
 * <p>{@code SPACE}, {@code RATIO} and {@code COLON} are not declared in this class; they are
 * resolved through the static import of {@code net.filebot.util.RegularExpressions}.
 *
 * <p>Every {@code normalize(...)} based helper trims the result of each replacement pass, so
 * leading and trailing whitespace never survives these methods.
 */
public class Normalization {

    /**
     * One or more apostrophe-like characters: U+0027, U+0060, U+00B4, U+2018, U+2019 and U+02BB.
     * This is the same set as the first row of {@link #QUOTES}; the non-ASCII members are written
     * as escapes so that look-alike characters cannot be confused or lost in the source file.
     */
    public static final Pattern APOSTROPHE = compile("['`\u00b4\u2018\u2019\u02bb]+");

    /** One or more punctuation or whitespace characters, using Unicode character classes. */
    public static final Pattern PUNCTUATION_OR_SPACE = compile("[\\p{Punct}\\p{Space}]+", UNICODE_CHARACTER_CLASS);

    /** A single colon, question mark, dot or underscore, i.e. punctuation treated as a word separator. */
    public static final Pattern WORD_SEPARATOR_PUNCTUATION = compile("[:?._]");

    /** A parenthesized group at the very end of the input that does not start at the very beginning. */
    public static final Pattern TRAILING_PARENTHESIS = compile("(?<!^)[(]([^)]*)[)]$");

    /** One or more exclamation marks, question marks or dots at the end of the input. */
    public static final Pattern TRAILING_PUNCTUATION = compile("[!?.]+$");

    /**
     * Exactly eight hexadecimal digits (captured as group 1) enclosed by an opening {@code (} or
     * {@code [} and a closing {@code ]} or {@code )}. The two brackets are not required to be of
     * the same kind.
     */
    public static final Pattern EMBEDDED_CHECKSUM = compile("[\\(\\[](\\p{XDigit}{8})[\\]\\)]");

    /** Bracketed groups: {@code (...)}, {@code [...]} and {@code {...}}, none containing another opening bracket of its own kind. */
    private static final Pattern[] BRACKETS = new Pattern[] { compile("\\([^\\(]*\\)"), compile("\\[[^\\[]*\\]"), compile("\\{[^\\{]*\\}") };

    /**
     * Groups of quotation characters that are treated as equivalent. The first element of each
     * row is the canonical form: row 0 maps single-quote look-alikes to {@code '}, row 1 maps
     * double-quote look-alikes to {@code "}.
     */
    private static final char[][] QUOTES = { { '\'', '\u0060', '\u00b4', '\u2018', '\u2019', '\u02bb' }, { '\"', '\u201c', '\u201d' } };

    /**
     * Replaces every quote look-alike listed in {@link #QUOTES} with the canonical character of
     * its group.
     *
     * @param name the text to normalize
     * @return the text with all equivalent quotation marks unified
     */
    public static String normalizeQuotationMarks(String name) {
        for (char[] group : QUOTES) {
            char canonical = group[0];
            for (char variant : group) {
                name = name.replace(variant, canonical);
            }
        }
        return name;
    }

    /**
     * Removes any run of {@code !}, {@code ?} or {@code .} at the end of the given text.
     *
     * @param name the text to process
     * @return the trimmed text without trailing punctuation
     */
    public static String trimTrailingPunctuation(CharSequence name) {
        return normalize(name, TRAILING_PUNCTUATION, "");
    }

    /**
     * Removes apostrophes and collapses every run of punctuation or whitespace into a single
     * space.
     *
     * @param name the text to normalize
     * @return the normalized, trimmed text
     */
    public static String normalizePunctuation(String name) {
        return normalizePunctuation(name, "", " ");
    }

    /**
     * Replaces apostrophes first and then every remaining run of punctuation or whitespace.
     * Apostrophes are handled first so that they can be dropped instead of splitting a word.
     *
     * @param name the text to normalize
     * @param apostrophe replacement for each run of apostrophe characters
     * @param space replacement for each run of punctuation or whitespace
     * @return the normalized, trimmed text
     */
    public static String normalizePunctuation(String name, String apostrophe, String space) {
        // remove/normalize special characters
        Pattern[] pattern = { APOSTROPHE, PUNCTUATION_OR_SPACE };
        String[] replacement = { apostrophe, space };
        return normalize(name, pattern, replacement);
    }

    /**
     * Replaces every {@code (...)}, {@code [...]} and {@code {...}} group with a single space,
     * e.g. to drop group names and checksums.
     *
     * @param name the text to process
     * @return the trimmed text without bracketed groups
     */
    public static String normalizeBrackets(String name) {
        // remove group names and checksums, any [...] or (...)
        return normalize(name, BRACKETS, " ");
    }

    /**
     * Turns the separator punctuation {@code : ? . _} into spaces and then replaces every match
     * of {@code SPACE} with the given replacement.
     *
     * @param name the text to normalize
     * @param space replacement for each {@code SPACE} match (presumably a whitespace run; defined in RegularExpressions)
     * @return the normalized, trimmed text
     */
    public static String normalizeSpace(String name, String space) {
        Pattern[] patterns = { WORD_SEPARATOR_PUNCTUATION, SPACE };
        String[] replacements = { " ", space };
        return normalize(name, patterns, replacements);
    }

    /**
     * Replaces every match of {@code SPACE} with the given replacement.
     *
     * @param name the text to process
     * @param replacement literal replacement text
     * @return the trimmed result
     */
    public static String replaceSpace(CharSequence name, String replacement) {
        return normalize(name, SPACE, replacement);
    }

    /**
     * Replaces every match of {@code RATIO} and afterwards every match of {@code COLON}. Both
     * patterns are defined in RegularExpressions; {@code RATIO} is applied first, presumably so
     * that ratio-style colons can get a different replacement than ordinary colons.
     *
     * @param name the text to process
     * @param ratio literal replacement for {@code RATIO} matches
     * @param colon literal replacement for {@code COLON} matches
     * @return the trimmed result
     */
    public static String replaceColon(String name, String ratio, String colon) {
        Pattern[] pattern = { RATIO, COLON };
        String[] replacement = { ratio, colon };
        return normalize(name, pattern, replacement);
    }

    /**
     * Finds the first embedded checksum, i.e. eight hexadecimal digits in brackets.
     *
     * @param name the text to search
     * @return the eight hexadecimal digits without the brackets, or {@code null} if there is no match
     */
    public static String getEmbeddedChecksum(CharSequence name) {
        Matcher m = EMBEDDED_CHECKSUM.matcher(name);
        return m.find() ? m.group(1) : null;
    }

    /**
     * Removes every embedded checksum together with its surrounding brackets.
     *
     * @param name the text to process
     * @return the trimmed text without embedded checksums
     */
    public static String removeEmbeddedChecksum(CharSequence name) {
        // match embedded checksum and surrounding brackets
        return normalize(name, EMBEDDED_CHECKSUM, "");
    }

    /**
     * Removes a trailing parenthesized group, e.g. {@code Doctor Who (2005) -> Doctor Who}. A
     * group that starts at the very beginning of the text is left alone.
     *
     * @param name the text to process
     * @return the trimmed text without the trailing parenthesized group
     */
    public static String removeTrailingBrackets(CharSequence name) {
        // remove trailing parentheses, e.g. Doctor Who (2005) -> Doctor Who
        return normalize(name, TRAILING_PARENTHESIS, "");
    }

    /**
     * Replaces all matches of the pattern with the literal replacement and trims the result.
     * The replacement is quoted so that {@code $} and {@code \} carry no special meaning.
     */
    private static String normalize(CharSequence name, Pattern pattern, String replacement) {
        return pattern.matcher(name).replaceAll(Matcher.quoteReplacement(replacement)).trim();
    }

    /** Applies the patterns in order, each with the same literal replacement. */
    private static String normalize(String name, Pattern[] pattern, String replacement) {
        for (Pattern p : pattern) {
            name = normalize(name, p, replacement);
        }
        return name;
    }

    /** Applies the patterns in order, each with the replacement at the same index. */
    private static String normalize(String name, Pattern[] pattern, String[] replacement) {
        for (int i = 0; i < pattern.length; i++) {
            name = normalize(name, pattern[i], replacement[i]);
        }
        return name;
    }

    /**
     * Shortens the title to at most {@code limit} characters by keeping only as many leading
     * words as fit, joined by single spaces.
     *
     * <p>A title that already fits (length less than or equal to {@code limit}) is returned
     * unchanged. If even the first word is longer than {@code limit}, the result is an empty
     * string, because words are never cut in the middle.
     *
     * @param title the text to truncate, may be {@code null}
     * @param limit the maximum length of the result (inclusive)
     * @return {@code null} if title is {@code null}, otherwise the title or a truncated, trimmed prefix of its words
     */
    public static String truncateText(String title, int limit) {
        // a title of exactly limit characters already fits and must not be truncated
        if (title == null || title.length() <= limit) {
            return title;
        }

        String[] words = SPACE.split(title);
        StringBuilder s = new StringBuilder();

        for (int i = 0; i < words.length; i++) {
            // every word but the first costs one extra character for the separating space
            int separator = i > 0 ? 1 : 0;
            if (s.length() + separator + words[i].length() > limit) {
                break;
            }
            if (i > 0) {
                s.append(' ');
            }
            s.append(words[i]);
        }

        return s.toString().trim();
    }
}