refactor number parsing

This commit is contained in:
Reinhard Pointner 2016-01-10 04:54:35 +00:00
parent 49561dd944
commit 9cc353e981
12 changed files with 87 additions and 108 deletions

View File

@ -1,11 +1,12 @@
package net.filebot.archive;
import static net.filebot.util.StringUtilities.*;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
@ -94,8 +95,8 @@ public class Archive implements Closeable {
Matcher matcher = volume.matcher(path.getName());
if (matcher.find()) {
Scanner scanner = new Scanner(matcher.group()).useDelimiter("\\D+");
if (!scanner.hasNext() || scanner.nextInt() != 1) {
Integer i = matchInteger(matcher.group());
if (i == null || i != 1) {
return false;
}
}

View File

@ -401,7 +401,7 @@ public class MediaDetection {
for (File path : listPathTail(f, 2, true)) {
String fn = getName(path);
// ignore non-strict series name parsing if there are movie year patterns
if (!strict && parseMovieYear(fn).equals(parseNumbers(fn))) {
if (!strict && parseMovieYear(fn).equals(matchIntegers(fn))) {
break;
}
String sn = seriesNameMatcher.matchByEpisodeIdentifier(fn);
@ -763,27 +763,7 @@ public class MediaDetection {
}
public static List<Integer> parseMovieYear(String name) {
List<Integer> years = new ArrayList<Integer>();
for (String it : name.split("\\D+")) {
if (it.length() == 4) {
int year = Integer.parseInt(it);
if (1950 < year && year < 2050) {
years.add(year);
}
}
}
return years;
}
public static List<Integer> parseNumbers(String name) {
List<Integer> numbers = new ArrayList<Integer>();
for (String it : name.split("\\D+")) {
if (it.length() > 0) {
int n = Integer.parseInt(it);
numbers.add(n);
}
}
return numbers;
return matchIntegers(name).stream().filter(year -> 1950 < year && year < 2050).collect(Collectors.toList());
}
public static String reduceMovieName(String name, boolean strict) throws IOException {

View File

@ -1,6 +1,7 @@
package net.filebot.similarity;
import static java.lang.Math.*;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.filebot.media.MediaDetection.*;
@ -19,7 +20,6 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
@ -451,11 +451,7 @@ public enum EpisodeMetrics implements SimilarityMetric {
}
// simplify file name if possible and extract numbers
List<String> numbers = new ArrayList<String>(4);
Scanner scanner = new Scanner(normalizeObject(object)).useDelimiter("\\D+");
while (scanner.hasNextInt()) {
numbers.add(String.valueOf(scanner.nextInt()));
}
List<Integer> numbers = matchIntegers(normalizeObject(object));
return join(numbers, " ");
}
}),

View File

@ -1,11 +1,10 @@
package net.filebot.similarity;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
@ -13,78 +12,65 @@ import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler;
import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler;
public class NumericSimilarityMetric implements SimilarityMetric {
private final AbstractStringMetric metric;
public NumericSimilarityMetric() {
// I don't exactly know why, but I get a good matching behavior
// when using QGramsDistance or BlockDistance
metric = new QGramsDistance(new NumberTokeniser());
}
@Override
public float getSimilarity(Object o1, Object o2) {
return metric.getSimilarity(normalize(o1), normalize(o2));
}
protected String normalize(Object object) {
// no need to do anything special here, because we don't care about anything but number patterns anyway
return object.toString();
}
private static class NumberTokeniser implements InterfaceTokeniser {
private final String delimiter = "\\D+";
private static final Pattern DIGIT = Pattern.compile("\\d+");
@Override
public ArrayList<String> tokenizeToArrayList(String input) {
public ArrayList<String> tokenizeToArrayList(String s) {
ArrayList<String> tokens = new ArrayList<String>();
// scan for number patterns, use non-number pattern as delimiter
Scanner scanner = new Scanner(input).useDelimiter(delimiter);
while (scanner.hasNextInt()) {
// remove leading zeros from number tokens by scanning for Integers
tokens.add(String.valueOf(scanner.nextInt()));
Matcher m = DIGIT.matcher(s);
while (m.find()) {
// remove leading zeros
tokens.add(new Integer(m.group()).toString());
}
return tokens;
}
@Override
public String getDelimiters() {
return "\\D+";
}
@Override
public Set<String> tokenizeToSet(String input) {
return new LinkedHashSet<String>(tokenizeToArrayList(input));
}
@Override
public String getShortDescriptionString() {
return getClass().getSimpleName();
}
@Override
public String getDelimiters() {
return delimiter;
}
private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler();
@Override
public InterfaceTermHandler getStopWordHandler() {
return stopWordHandler;
}
@Override
public void setStopWordHandler(InterfaceTermHandler stopWordHandler) {
this.stopWordHandler = stopWordHandler;

View File

@ -3,6 +3,7 @@ package net.filebot.similarity;
import static java.util.Collections.*;
import static java.util.regex.Pattern.*;
import static net.filebot.util.FileUtilities.*;
import static net.filebot.util.StringUtilities.*;
import java.io.File;
import java.util.ArrayList;
@ -10,7 +11,6 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
@ -54,9 +54,9 @@ public class SeasonEpisodeMatcher {
@Override
protected Collection<SxE> process(MatchResult match) {
List<SxE> matches = new ArrayList<SxE>(2);
Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+");
while (epno.hasNext()) {
matches.add(new SxE(match.group(1), epno.next()));
int seasonNumber = Integer.parseInt(match.group(1));
for (int episodeNumber : matchIntegers(match.group(2))) {
matches.add(new SxE(seasonNumber, episodeNumber));
}
return matches;
}
@ -68,9 +68,9 @@ public class SeasonEpisodeMatcher {
@Override
protected Collection<SxE> process(MatchResult match) {
List<SxE> matches = new ArrayList<SxE>(2);
String[] num = match.group(0).split("\\D+");
for (int i = 0; i < num.length; i += 2) {
matches.add(new SxE(num[i], num[i + 1])); // SxE-SxE-SxE
String[] numbers = NON_DIGIT.split(match.group(0));
for (int i = 0; i < numbers.length; i += 2) {
matches.add(new SxE(numbers[i], numbers[i + 1])); // SxE-SxE-SxE
}
return matches;
}
@ -82,9 +82,9 @@ public class SeasonEpisodeMatcher {
@Override
protected Collection<SxE> process(MatchResult match) {
List<SxE> matches = new ArrayList<SxE>(2);
Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+");
while (epno.hasNext()) {
matches.add(new SxE(match.group(1), epno.next()));
int seasonNumber = Integer.parseInt(match.group(1));
for (int episodeNumber : matchIntegers(match.group(2))) {
matches.add(new SxE(seasonNumber, episodeNumber));
}
return matches;
}
@ -96,9 +96,9 @@ public class SeasonEpisodeMatcher {
@Override
protected Collection<SxE> process(MatchResult match) {
List<SxE> matches = new ArrayList<SxE>(2);
Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+");
while (epno.hasNext()) {
matches.add(new SxE(match.group(1), epno.next()));
int seasonNumber = Integer.parseInt(match.group(1));
for (int episodeNumber : matchIntegers(match.group(2))) {
matches.add(new SxE(seasonNumber, episodeNumber));
}
return matches;
}

View File

@ -1,12 +1,44 @@
package net.filebot.util;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class StringUtilities {
public static final Pattern DIGIT = Pattern.compile("\\d+");
public static final Pattern NON_DIGIT = Pattern.compile("\\D+");
/**
 * Extracts every run of digits from the given text and returns them as integers,
 * in order of appearance. Leading zeros are dropped by the numeric conversion
 * (e.g. {@code "007"} yields {@code 7}).
 *
 * <p>
 * Digit runs that cannot be represented as an {@code int} (e.g. timestamps or
 * hashes embedded in a file name) are skipped, so that a single oversized number
 * does not abort the whole scan with a {@link NumberFormatException}.
 *
 * @param s
 *            the text to scan; may be {@code null} or empty
 * @return the integers found, possibly empty, never {@code null}
 */
public static List<Integer> matchIntegers(CharSequence s) {
	if (s == null || s.length() == 0) {
		return emptyList();
	}

	List<Integer> numbers = new ArrayList<Integer>();
	Matcher matcher = DIGIT.matcher(s);
	while (matcher.find()) {
		try {
			// Integer.valueOf replaces the deprecated boxing constructor new Integer(String)
			numbers.add(Integer.valueOf(matcher.group()));
		} catch (NumberFormatException ignored) {
			// digit run does not fit into an int: ignore it and keep scanning
		}
	}
	return numbers;
}
/**
 * Returns the first run of digits in the given text as an integer.
 *
 * <p>
 * {@code null} is the "no usable number" result: it is returned for
 * {@code null} or empty input, for text without any digits, and when the first
 * digit run cannot be represented as an {@code int}. The method does not throw
 * {@link NumberFormatException}.
 *
 * @param s
 *            the text to scan; may be {@code null} or empty
 * @return the first integer found, or {@code null} if there is none
 */
public static Integer matchInteger(CharSequence s) {
	if (s == null || s.length() == 0) {
		return null;
	}

	Matcher matcher = DIGIT.matcher(s);
	if (!matcher.find()) {
		return null;
	}

	try {
		// Integer.valueOf replaces the deprecated boxing constructor new Integer(String)
		return Integer.valueOf(matcher.group());
	} catch (NumberFormatException ignored) {
		// digit run does not fit into an int: treat it like "no number found"
		return null;
	}
}
/**
 * Null-safe variant of {@link Object#toString()}.
 *
 * @param object
 *            the value to convert; may be {@code null}
 * @return {@code object.toString()}, or {@code null} if {@code object} is {@code null}
 */
public static String asString(Object object) {
	if (object != null) {
		return object.toString();
	}
	return null;
}
@ -28,7 +60,7 @@ public final class StringUtilities {
}
public static String joinSorted(Object[] values, CharSequence delimiter, Comparator<Object> sort, CharSequence start, CharSequence end) {
return join(Arrays.stream(values).sorted(sort)::iterator, delimiter, start, end);
return join(stream(values).sorted(sort)::iterator, delimiter, start, end);
}
public static String join(Iterable<?> values, CharSequence delimiter, CharSequence start, CharSequence end) {

View File

@ -3,8 +3,6 @@ package net.filebot.util;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;
import javax.xml.namespace.QName;
import javax.xml.xpath.XPathConstants;
@ -126,14 +124,6 @@ public final class XPathUtilities {
return list;
}
public static Integer getInteger(String textContent) {
try {
return new Scanner(textContent).useDelimiter("\\D+").nextInt();
} catch (NumberFormatException | NoSuchElementException | NullPointerException e) {
return null;
}
}
public static Double getDecimal(String textContent) {
try {
return new Double(textContent);

View File

@ -1,6 +1,7 @@
package net.filebot.web;
import static java.util.Collections.*;
import static net.filebot.util.StringUtilities.*;
import static net.filebot.util.XPathUtilities.*;
import static net.filebot.web.EpisodeUtilities.*;
import static net.filebot.web.WebRequest.*;
@ -123,7 +124,7 @@ public class AnidbClient extends AbstractEpisodeListProvider {
seriesInfo.setName(selectString("anime/titles/title[@type='main']", dom));
seriesInfo.setRating(getDecimal(selectString("anime/ratings/permanent", dom)));
seriesInfo.setRatingCount(getInteger(getTextContent("anime/ratings/permanent/@count", dom)));
seriesInfo.setRatingCount(matchInteger(getTextContent("anime/ratings/permanent/@count", dom)));
seriesInfo.setStartDate(SimpleDate.parse(selectString("anime/startdate", dom), "yyyy-MM-dd"));
// add categories ordered by weight as genres
@ -132,7 +133,7 @@ public class AnidbClient extends AbstractEpisodeListProvider {
// * limit to 5 genres
seriesInfo.setGenres(selectNodes("anime/categories/category", dom).stream().map(categoryNode -> {
String name = getTextContent("name", categoryNode);
Integer weight = getInteger(getAttribute("weight", categoryNode));
Integer weight = matchInteger(getAttribute("weight", categoryNode));
return new SimpleImmutableEntry<String, Integer>(name, weight);
}).filter(nw -> {
return nw.getKey() != null && nw.getValue() != null && nw.getKey().length() > 0 && nw.getValue() >= 400;

View File

@ -1,12 +1,12 @@
package net.filebot.web;
import static net.filebot.util.StringUtilities.*;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
@ -83,15 +83,7 @@ public class ID3Lookup implements MusicIdentificationService {
}
private Integer getInteger(MediaInfo mediaInfo, String field) {
String value = getString(mediaInfo, field);
if (value != null) {
try {
return new Scanner(value).useDelimiter("\\D+").nextInt();
} catch (NumberFormatException | NoSuchElementException e) {
Logger.getLogger(ID3Lookup.class.getName()).log(Level.WARNING, e.toString());
}
}
return null;
return matchInteger(getString(mediaInfo, field));
}
}

View File

@ -1,6 +1,7 @@
package net.filebot.web;
import static java.util.Collections.*;
import static net.filebot.util.StringUtilities.*;
import static net.filebot.web.WebRequest.*;
import java.io.File;
@ -17,7 +18,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
@ -123,7 +123,7 @@ public class OMDbClient implements MovieIdentificationService {
public Movie getMovie(Map<String, String> info) {
try {
String name = info.get("Title");
int year = new Scanner(info.get("Year")).useDelimiter("\\D+").nextInt();
int year = matchInteger(info.get("Year"));
int imdbid = Integer.parseInt(info.get("imdbID").replace("tt", ""));
if (name.length() <= 0 || year <= 1900 || imdbid <= 0)

View File

@ -2,6 +2,7 @@ package net.filebot.web;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.filebot.util.StringUtilities.*;
import static net.filebot.web.WebRequest.*;
import java.io.File;
@ -22,7 +23,6 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
@ -110,7 +110,7 @@ public class TMDbClient implements MovieIdentificationService {
id = Float.valueOf(it.get("id").toString()).intValue();
try {
String release = (String) it.get("release_date");
year = new Scanner(release).useDelimiter("\\D+").nextInt();
year = matchInteger(release);
} catch (Exception e) {
throw new IllegalArgumentException("Missing data: release date");
}

View File

@ -2,6 +2,7 @@ package net.filebot.web;
import static java.util.Arrays.*;
import static java.util.Collections.*;
import static net.filebot.util.StringUtilities.*;
import static net.filebot.util.XPathUtilities.*;
import static net.filebot.web.EpisodeUtilities.*;
import static net.filebot.web.WebRequest.*;
@ -104,7 +105,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
Map<Integer, TheTVDBSearchResult> resultSet = new LinkedHashMap<Integer, TheTVDBSearchResult>();
for (Node node : nodes) {
int sid = getInteger(getTextContent("seriesid", node));
int sid = matchInteger(getTextContent("seriesid", node));
String seriesName = getTextContent("SeriesName", node);
List<String> aliasNames = new ArrayList<String>();
@ -144,8 +145,8 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
seriesInfo.setStatus(getTextContent("Status", seriesNode));
seriesInfo.setRating(getDecimal(getTextContent("Rating", seriesNode)));
seriesInfo.setRatingCount(getInteger(getTextContent("RatingCount", seriesNode)));
seriesInfo.setRuntime(getInteger(getTextContent("Runtime", seriesNode)));
seriesInfo.setRatingCount(matchInteger(getTextContent("RatingCount", seriesNode)));
seriesInfo.setRuntime(matchInteger(getTextContent("Runtime", seriesNode)));
seriesInfo.setActors(getListContent("Actors", "\\|", seriesNode));
seriesInfo.setGenres(getListContent("Genre", "\\|", seriesNode));
seriesInfo.setStartDate(SimpleDate.parse(getTextContent("FirstAired", seriesNode), "yyyy-MM-dd"));
@ -162,17 +163,17 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
for (Node node : nodes) {
String episodeName = getTextContent("EpisodeName", node);
Integer absoluteNumber = getInteger(getTextContent("absolute_number", node));
Integer absoluteNumber = matchInteger(getTextContent("absolute_number", node));
SimpleDate airdate = SimpleDate.parse(getTextContent("FirstAired", node), "yyyy-MM-dd");
// default numbering
Integer episodeNumber = getInteger(getTextContent("EpisodeNumber", node));
Integer seasonNumber = getInteger(getTextContent("SeasonNumber", node));
Integer episodeNumber = matchInteger(getTextContent("EpisodeNumber", node));
Integer seasonNumber = matchInteger(getTextContent("SeasonNumber", node));
// adjust for DVD numbering if possible
if (sortOrder == SortOrder.DVD) {
Integer dvdSeasonNumber = getInteger(getTextContent("DVD_season", node));
Integer dvdEpisodeNumber = getInteger(getTextContent("DVD_episodenumber", node));
Integer dvdSeasonNumber = matchInteger(getTextContent("DVD_season", node));
Integer dvdEpisodeNumber = matchInteger(getTextContent("DVD_episodenumber", node));
// require both values to be valid integer numbers
if (dvdSeasonNumber != null && dvdEpisodeNumber != null) {
@ -185,7 +186,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider {
if (seasonNumber == null || seasonNumber == 0) {
// handle as special episode
for (String specialSeasonTag : new String[] { "airsafter_season", "airsbefore_season" }) {
Integer specialSeason = getInteger(getTextContent(specialSeasonTag, node));
Integer specialSeason = matchInteger(getTextContent(specialSeasonTag, node));
if (specialSeason != null && specialSeason != 0) {
seasonNumber = specialSeason;
break;