From 9cc353e9818c5ad5f39dd0189b4ec0cc0560c740 Mon Sep 17 00:00:00 2001 From: Reinhard Pointner Date: Sun, 10 Jan 2016 04:54:35 +0000 Subject: [PATCH] refactor number parsing --- source/net/filebot/archive/Archive.java | 7 ++-- source/net/filebot/media/MediaDetection.java | 24 +----------- .../filebot/similarity/EpisodeMetrics.java | 8 +--- .../similarity/NumericSimilarityMetric.java | 38 ++++++------------- .../similarity/SeasonEpisodeMatcher.java | 26 ++++++------- source/net/filebot/util/StringUtilities.java | 36 +++++++++++++++++- source/net/filebot/util/XPathUtilities.java | 10 ----- source/net/filebot/web/AnidbClient.java | 5 ++- source/net/filebot/web/ID3Lookup.java | 14 ++----- source/net/filebot/web/OMDbClient.java | 4 +- source/net/filebot/web/TMDbClient.java | 4 +- source/net/filebot/web/TheTVDBClient.java | 19 +++++----- 12 files changed, 87 insertions(+), 108 deletions(-) diff --git a/source/net/filebot/archive/Archive.java b/source/net/filebot/archive/Archive.java index bc15c52b..49c6fafb 100644 --- a/source/net/filebot/archive/Archive.java +++ b/source/net/filebot/archive/Archive.java @@ -1,11 +1,12 @@ package net.filebot.archive; +import static net.filebot.util.StringUtilities.*; + import java.io.Closeable; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.util.List; -import java.util.Scanner; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; @@ -94,8 +95,8 @@ public class Archive implements Closeable { Matcher matcher = volume.matcher(path.getName()); if (matcher.find()) { - Scanner scanner = new Scanner(matcher.group()).useDelimiter("\\D+"); - if (!scanner.hasNext() || scanner.nextInt() != 1) { + Integer i = matchInteger(matcher.group()); + if (i == null || i != 1) { return false; } } diff --git a/source/net/filebot/media/MediaDetection.java b/source/net/filebot/media/MediaDetection.java index 6ff1123d..a8ce7f11 100644 --- a/source/net/filebot/media/MediaDetection.java +++ b/source/net/filebot/media/MediaDetection.java @@ -401,7 +401,7 @@ public class MediaDetection { for (File path : listPathTail(f, 2, true)) { String fn = getName(path); // ignore non-strict series name parsing if there are movie year patterns - if (!strict && parseMovieYear(fn).equals(parseNumbers(fn))) { + if (!strict && parseMovieYear(fn).equals(matchIntegers(fn))) { break; } String sn = seriesNameMatcher.matchByEpisodeIdentifier(fn); @@ -763,27 +763,7 @@ public class MediaDetection { } public static List parseMovieYear(String name) { - List years = new ArrayList(); - for (String it : name.split("\\D+")) { - if (it.length() == 4) { - int year = Integer.parseInt(it); - if (1950 < year && year < 2050) { - years.add(year); - } - } - } - return years; - } - - public static List parseNumbers(String name) { - List numbers = new ArrayList(); - for (String it : name.split("\\D+")) { - if (it.length() > 0) { - int n = Integer.parseInt(it); - numbers.add(n); - } - } - return numbers; + return matchIntegers(name).stream().filter(year -> 1950 < year && year < 2050).collect(Collectors.toList()); } public static String reduceMovieName(String name, boolean strict) throws IOException { diff --git a/source/net/filebot/similarity/EpisodeMetrics.java b/source/net/filebot/similarity/EpisodeMetrics.java index e681287f..5ee802e6 100644 --- a/source/net/filebot/similarity/EpisodeMetrics.java +++ b/source/net/filebot/similarity/EpisodeMetrics.java @@ -1,6 +1,7 @@ package net.filebot.similarity; import static java.lang.Math.*; +import static java.util.Arrays.*; import static java.util.Collections.*; import static java.util.regex.Pattern.*; import static net.filebot.media.MediaDetection.*; @@ -19,7 +20,6 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Scanner; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; @@ -451,11 +451,7 @@ public enum EpisodeMetrics implements SimilarityMetric { } // simplify file name if possible and extract numbers - List numbers = new ArrayList(4); - Scanner scanner = new Scanner(normalizeObject(object)).useDelimiter("\\D+"); - while (scanner.hasNextInt()) { - numbers.add(String.valueOf(scanner.nextInt())); - } + List numbers = matchIntegers(normalizeObject(object)); return join(numbers, " "); } }), diff --git a/source/net/filebot/similarity/NumericSimilarityMetric.java b/source/net/filebot/similarity/NumericSimilarityMetric.java index dae807af..b8cc264f 100644 --- a/source/net/filebot/similarity/NumericSimilarityMetric.java +++ b/source/net/filebot/similarity/NumericSimilarityMetric.java @@ -1,11 +1,10 @@ - package net.filebot.similarity; - import java.util.ArrayList; import java.util.LinkedHashSet; -import java.util.Scanner; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance; @@ -13,78 +12,65 @@ import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser; import uk.ac.shef.wit.simmetrics.wordhandlers.DummyStopTermHandler; import uk.ac.shef.wit.simmetrics.wordhandlers.InterfaceTermHandler; - public class NumericSimilarityMetric implements SimilarityMetric { private final AbstractStringMetric metric; - public NumericSimilarityMetric() { // I don't exactly know why, but I get a good matching behavior // when using QGramsDistance or BlockDistance metric = new QGramsDistance(new NumberTokeniser()); } - @Override public float getSimilarity(Object o1, Object o2) { return metric.getSimilarity(normalize(o1), normalize(o2)); } - protected String normalize(Object object) { // no need to do anything special here, because we don't care about anything but number patterns anyway return object.toString(); } - private static class NumberTokeniser implements InterfaceTokeniser { - private final String delimiter = "\\D+"; - + private static final Pattern DIGIT = Pattern.compile("\\d+"); @Override - public ArrayList tokenizeToArrayList(String input) { + public ArrayList tokenizeToArrayList(String s) { ArrayList tokens = new ArrayList(); - // scan for number patterns, use non-number pattern as delimiter - Scanner scanner = new Scanner(input).useDelimiter(delimiter); - - while (scanner.hasNextInt()) { - // remove leading zeros from number tokens by scanning for Integers - tokens.add(String.valueOf(scanner.nextInt())); + Matcher m = DIGIT.matcher(s); + while (m.find()) { + // remove leading zeros + tokens.add(new Integer(m.group()).toString()); } return tokens; } + @Override + public String getDelimiters() { + return "\\D+"; + } @Override public Set tokenizeToSet(String input) { return new LinkedHashSet(tokenizeToArrayList(input)); } - @Override public String getShortDescriptionString() { return getClass().getSimpleName(); } - - @Override - public String getDelimiters() { - return delimiter; - } - private InterfaceTermHandler stopWordHandler = new DummyStopTermHandler(); - @Override public InterfaceTermHandler getStopWordHandler() { return stopWordHandler; } - @Override public void setStopWordHandler(InterfaceTermHandler stopWordHandler) { this.stopWordHandler = stopWordHandler; diff --git a/source/net/filebot/similarity/SeasonEpisodeMatcher.java b/source/net/filebot/similarity/SeasonEpisodeMatcher.java index 29e6be13..b32fc480 100644 --- a/source/net/filebot/similarity/SeasonEpisodeMatcher.java +++ b/source/net/filebot/similarity/SeasonEpisodeMatcher.java @@ -3,6 +3,7 @@ package net.filebot.similarity; import static java.util.Collections.*; import static java.util.regex.Pattern.*; import static net.filebot.util.FileUtilities.*; +import static net.filebot.util.StringUtilities.*; import java.io.File; import java.util.ArrayList; @@ -10,7 +11,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.LinkedHashSet; import java.util.List; -import java.util.Scanner; import java.util.Set; import java.util.regex.MatchResult; import java.util.regex.Matcher; @@ -54,9 +54,9 @@ public class SeasonEpisodeMatcher { @Override protected Collection process(MatchResult match) { List matches = new ArrayList(2); - Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+"); - while (epno.hasNext()) { - matches.add(new SxE(match.group(1), epno.next())); + int seasonNumber = Integer.parseInt(match.group(1)); + for (int episodeNumber : matchIntegers(match.group(2))) { + matches.add(new SxE(seasonNumber, episodeNumber)); } return matches; } @@ -68,9 +68,9 @@ public class SeasonEpisodeMatcher { @Override protected Collection process(MatchResult match) { List matches = new ArrayList(2); - String[] num = match.group(0).split("\\D+"); - for (int i = 0; i < num.length; i += 2) { - matches.add(new SxE(num[i], num[i + 1])); // SxE-SxE-SxE + String[] numbers = NON_DIGIT.split(match.group(0)); + for (int i = 0; i < numbers.length; i += 2) { + matches.add(new SxE(numbers[i], numbers[i + 1])); // SxE-SxE-SxE } return matches; } @@ -82,9 +82,9 @@ public class SeasonEpisodeMatcher { @Override protected Collection process(MatchResult match) { List matches = new ArrayList(2); - Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+"); - while (epno.hasNext()) { - matches.add(new SxE(match.group(1), epno.next())); + int seasonNumber = Integer.parseInt(match.group(1)); + for (int episodeNumber : matchIntegers(match.group(2))) { + matches.add(new SxE(seasonNumber, episodeNumber)); } return matches; } @@ -96,9 +96,9 @@ public class SeasonEpisodeMatcher { @Override protected Collection process(MatchResult match) { List matches = new ArrayList(2); - Scanner epno = new Scanner(match.group(2)).useDelimiter("\\D+"); - while (epno.hasNext()) { - matches.add(new SxE(match.group(1), epno.next())); + int seasonNumber = Integer.parseInt(match.group(1)); + for (int episodeNumber : matchIntegers(match.group(2))) { + matches.add(new SxE(seasonNumber, episodeNumber)); } return matches; } diff --git a/source/net/filebot/util/StringUtilities.java b/source/net/filebot/util/StringUtilities.java index 3f02df49..2b07e0e0 100644 --- a/source/net/filebot/util/StringUtilities.java +++ b/source/net/filebot/util/StringUtilities.java @@ -1,12 +1,44 @@ package net.filebot.util; import static java.util.Arrays.*; +import static java.util.Collections.*; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Comparator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public final class StringUtilities { + public static final Pattern DIGIT = Pattern.compile("\\d+"); + public static final Pattern NON_DIGIT = Pattern.compile("\\D+"); + + public static List matchIntegers(CharSequence s) { + if (s == null || s.length() == 0) { + return emptyList(); + } + + List numbers = new ArrayList(); + Matcher matcher = DIGIT.matcher(s); + while (matcher.find()) { + numbers.add(new Integer(matcher.group())); + } + return numbers; + } + + public static Integer matchInteger(CharSequence s) { + if (s == null || s.length() == 0) { + return null; + } + + Matcher matcher = DIGIT.matcher(s); + if (matcher.find()) { + return new Integer(matcher.group()); + } + return null; + } + public static String asString(Object object) { return object == null ? null : object.toString(); } @@ -28,7 +60,7 @@ public final class StringUtilities { } public static String joinSorted(Object[] values, CharSequence delimiter, Comparator sort, CharSequence start, CharSequence end) { - return join(Arrays.stream(values).sorted(sort)::iterator, delimiter, start, end); + return join(stream(values).sorted(sort)::iterator, delimiter, start, end); } public static String join(Iterable values, CharSequence delimiter, CharSequence start, CharSequence end) { diff --git a/source/net/filebot/util/XPathUtilities.java b/source/net/filebot/util/XPathUtilities.java index 62bd91bf..e7446251 100644 --- a/source/net/filebot/util/XPathUtilities.java +++ b/source/net/filebot/util/XPathUtilities.java @@ -3,8 +3,6 @@ package net.filebot.util; import java.util.AbstractList; import java.util.ArrayList; import java.util.List; -import java.util.NoSuchElementException; -import java.util.Scanner; import javax.xml.namespace.QName; import javax.xml.xpath.XPathConstants; @@ -126,14 +124,6 @@ public final class XPathUtilities { return list; } - public static Integer getInteger(String textContent) { - try { - return new Scanner(textContent).useDelimiter("\\D+").nextInt(); - } catch (NumberFormatException | NoSuchElementException | NullPointerException e) { - return null; - } - } - public static Double getDecimal(String textContent) { try { return new Double(textContent); diff --git a/source/net/filebot/web/AnidbClient.java b/source/net/filebot/web/AnidbClient.java index abaeb2b6..ade9b917 100644 --- a/source/net/filebot/web/AnidbClient.java +++ b/source/net/filebot/web/AnidbClient.java @@ -1,6 +1,7 @@ package net.filebot.web; import static java.util.Collections.*; +import static net.filebot.util.StringUtilities.*; import static net.filebot.util.XPathUtilities.*; import static net.filebot.web.EpisodeUtilities.*; import static net.filebot.web.WebRequest.*; @@ -123,7 +124,7 @@ public class AnidbClient extends AbstractEpisodeListProvider { seriesInfo.setName(selectString("anime/titles/title[@type='main']", dom)); seriesInfo.setRating(getDecimal(selectString("anime/ratings/permanent", dom))); - seriesInfo.setRatingCount(getInteger(getTextContent("anime/ratings/permanent/@count", dom))); + seriesInfo.setRatingCount(matchInteger(getTextContent("anime/ratings/permanent/@count", dom))); seriesInfo.setStartDate(SimpleDate.parse(selectString("anime/startdate", dom), "yyyy-MM-dd")); // add categories ordered by weight as genres @@ -132,7 +133,7 @@ public class AnidbClient extends AbstractEpisodeListProvider { // * limit to 5 genres seriesInfo.setGenres(selectNodes("anime/categories/category", dom).stream().map(categoryNode -> { String name = getTextContent("name", categoryNode); - Integer weight = getInteger(getAttribute("weight", categoryNode)); + Integer weight = matchInteger(getAttribute("weight", categoryNode)); return new SimpleImmutableEntry(name, weight); }).filter(nw -> { return nw.getKey() != null && nw.getValue() != null && nw.getKey().length() > 0 && nw.getValue() >= 400; diff --git a/source/net/filebot/web/ID3Lookup.java b/source/net/filebot/web/ID3Lookup.java index 41e0d862..b5407730 100644 --- a/source/net/filebot/web/ID3Lookup.java +++ b/source/net/filebot/web/ID3Lookup.java @@ -1,12 +1,12 @@ package net.filebot.web; +import static net.filebot.util.StringUtilities.*; + import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.LinkedHashMap; import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Scanner; import java.util.logging.Level; import java.util.logging.Logger; @@ -83,15 +83,7 @@ public class ID3Lookup implements MusicIdentificationService { } private Integer getInteger(MediaInfo mediaInfo, String field) { - String value = getString(mediaInfo, field); - if (value != null) { - try { - return new Scanner(value).useDelimiter("\\D+").nextInt(); - } catch (NumberFormatException | NoSuchElementException e) { - Logger.getLogger(ID3Lookup.class.getName()).log(Level.WARNING, e.toString()); - } - } - return null; + return matchInteger(getString(mediaInfo, field)); } } diff --git a/source/net/filebot/web/OMDbClient.java b/source/net/filebot/web/OMDbClient.java index 568011f0..19cef64a 100644 --- a/source/net/filebot/web/OMDbClient.java +++ b/source/net/filebot/web/OMDbClient.java @@ -1,6 +1,7 @@ package net.filebot.web; import static java.util.Collections.*; +import static net.filebot.util.StringUtilities.*; import static net.filebot.web.WebRequest.*; import java.io.File; @@ -17,7 +18,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; -import java.util.Scanner; import java.util.TreeMap; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; @@ -123,7 +123,7 @@ public class OMDbClient implements MovieIdentificationService { public Movie getMovie(Map info) { try { String name = info.get("Title"); - int year = new Scanner(info.get("Year")).useDelimiter("\\D+").nextInt(); + int year = matchInteger(info.get("Year")); int imdbid = Integer.parseInt(info.get("imdbID").replace("tt", "")); if (name.length() <= 0 || year <= 1900 || imdbid <= 0) diff --git a/source/net/filebot/web/TMDbClient.java b/source/net/filebot/web/TMDbClient.java index 37c38991..5cc553cb 100644 --- a/source/net/filebot/web/TMDbClient.java +++ b/source/net/filebot/web/TMDbClient.java @@ -2,6 +2,7 @@ package net.filebot.web; import static java.util.Arrays.*; import static java.util.Collections.*; +import static net.filebot.util.StringUtilities.*; import static net.filebot.web.WebRequest.*; import java.io.File; @@ -22,7 +23,6 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Scanner; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.TimeUnit; @@ -110,7 +110,7 @@ public class TMDbClient implements MovieIdentificationService { id = Float.valueOf(it.get("id").toString()).intValue(); try { String release = (String) it.get("release_date"); - year = new Scanner(release).useDelimiter("\\D+").nextInt(); + year = matchInteger(release); } catch (Exception e) { throw new IllegalArgumentException("Missing data: release date"); } diff --git a/source/net/filebot/web/TheTVDBClient.java b/source/net/filebot/web/TheTVDBClient.java index 647a0236..41fffb58 100644 --- a/source/net/filebot/web/TheTVDBClient.java +++ b/source/net/filebot/web/TheTVDBClient.java @@ -2,6 +2,7 @@ package net.filebot.web; import static java.util.Arrays.*; import static java.util.Collections.*; +import static net.filebot.util.StringUtilities.*; import static net.filebot.util.XPathUtilities.*; import static net.filebot.web.EpisodeUtilities.*; import static net.filebot.web.WebRequest.*; @@ -104,7 +105,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider { Map resultSet = new LinkedHashMap(); for (Node node : nodes) { - int sid = getInteger(getTextContent("seriesid", node)); + int sid = matchInteger(getTextContent("seriesid", node)); String seriesName = getTextContent("SeriesName", node); List aliasNames = new ArrayList(); @@ -144,8 +145,8 @@ public class TheTVDBClient extends AbstractEpisodeListProvider { seriesInfo.setStatus(getTextContent("Status", seriesNode)); seriesInfo.setRating(getDecimal(getTextContent("Rating", seriesNode))); - seriesInfo.setRatingCount(getInteger(getTextContent("RatingCount", seriesNode))); - seriesInfo.setRuntime(getInteger(getTextContent("Runtime", seriesNode))); + seriesInfo.setRatingCount(matchInteger(getTextContent("RatingCount", seriesNode))); + seriesInfo.setRuntime(matchInteger(getTextContent("Runtime", seriesNode))); seriesInfo.setActors(getListContent("Actors", "\\|", seriesNode)); seriesInfo.setGenres(getListContent("Genre", "\\|", seriesNode)); seriesInfo.setStartDate(SimpleDate.parse(getTextContent("FirstAired", seriesNode), "yyyy-MM-dd")); @@ -162,17 +163,17 @@ public class TheTVDBClient extends AbstractEpisodeListProvider { for (Node node : nodes) { String episodeName = getTextContent("EpisodeName", node); - Integer absoluteNumber = getInteger(getTextContent("absolute_number", node)); + Integer absoluteNumber = matchInteger(getTextContent("absolute_number", node)); SimpleDate airdate = SimpleDate.parse(getTextContent("FirstAired", node), "yyyy-MM-dd"); // default numbering - Integer episodeNumber = getInteger(getTextContent("EpisodeNumber", node)); - Integer seasonNumber = getInteger(getTextContent("SeasonNumber", node)); + Integer episodeNumber = matchInteger(getTextContent("EpisodeNumber", node)); + Integer seasonNumber = matchInteger(getTextContent("SeasonNumber", node)); // adjust for DVD numbering if possible if (sortOrder == SortOrder.DVD) { - Integer dvdSeasonNumber = getInteger(getTextContent("DVD_season", node)); - Integer dvdEpisodeNumber = getInteger(getTextContent("DVD_episodenumber", node)); + Integer dvdSeasonNumber = matchInteger(getTextContent("DVD_season", node)); + Integer dvdEpisodeNumber = matchInteger(getTextContent("DVD_episodenumber", node)); // require both values to be valid integer numbers if (dvdSeasonNumber != null && dvdEpisodeNumber != null) { @@ -185,7 +186,7 @@ public class TheTVDBClient extends AbstractEpisodeListProvider { if (seasonNumber == null || seasonNumber == 0) { // handle as special episode for (String specialSeasonTag : new String[] { "airsafter_season", "airsbefore_season" }) { - Integer specialSeason = getInteger(getTextContent(specialSeasonTag, node)); + Integer specialSeason = matchInteger(getTextContent(specialSeasonTag, node)); if (specialSeason != null && specialSeason != 0) { seasonNumber = specialSeason; break;