From 8bb76134715875c0290bd196f75d08730a7f9117 Mon Sep 17 00:00:00 2001 From: Jesse Vincent Date: Fri, 14 Jan 2011 01:37:46 +0000 Subject: [PATCH] Update URL regexes by importing from AOSP and then from IANA --- src/com/fsck/k9/helper/Regex.java | 194 +++++++++++++++++------------- 1 file changed, 113 insertions(+), 81 deletions(-) diff --git a/src/com/fsck/k9/helper/Regex.java b/src/com/fsck/k9/helper/Regex.java index fddb6ed92..a768c09d7 100644 --- a/src/com/fsck/k9/helper/Regex.java +++ b/src/com/fsck/k9/helper/Regex.java @@ -12,6 +12,11 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. + * + * Imported from AOSP on 2011-01-12 by JRV. + * Domain patterns updated from IANA on 2010-01-12 + * + * */ package com.fsck.k9.helper; @@ -19,89 +24,115 @@ package com.fsck.k9.helper; import java.util.regex.Matcher; import java.util.regex.Pattern; +/** + * Commonly used regular expression patterns. + */ public class Regex { - /** - * Regular expression pattern to match all IANA top-level domains. - * List accurate as of 2007/06/15. List taken from: - * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py - */ - public static final Pattern TOP_LEVEL_DOMAIN_PATTERN - = Pattern.compile( - "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" - + "|(biz|b[abdefghijmnorstvwyz])" - + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" - + "|d[ejkmoz]" - + "|(edu|e[cegrstu])" - + "|f[ijkmor]" - + "|(gov|g[abdefghilmnpqrstuwy])" - + "|h[kmnrtu]" - + "|(info|int|i[delmnoqrst])" - + "|(jobs|j[emop])" - + "|k[eghimnrwyz]" - + "|l[abcikrstuvy]" - + "|(mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" - + "|(name|net|n[acefgilopruz])" - + "|(org|om)" - + "|(pro|p[aefghklmnrstwy])" - + "|qa" - + "|r[eouw]" - + "|s[abcdeghijklmnortuvyz]" - + "|(tel|travel|t[cdfghjklmnoprtvwz])" - + "|u[agkmsyz]" - + "|v[aceginu]" - + "|w[fs]" - + "|y[etu]" - + "|z[amw])"); /** - * Regular expression pattern to match RFC 1738 URLs - * List accurate as of 2007/06/15. List taken from: + * Regular expression to match all IANA top-level domains. + * List accurate as of 2011/01/12. List taken from: * http://data.iana.org/TLD/tlds-alpha-by-domain.txt - * This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py */ - public static final Pattern WEB_URL_PATTERN - = Pattern.compile( - "((?:(http|https|Http|Https):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" - + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host - + "(?:" // plus top level domain - + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" - + "|(?:biz|b[abdefghijmnorstvwyz])" - + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" - + "|d[ejkmoz]" - + "|(?:edu|e[cegrstu])" - + "|f[ijkmor]" - + "|(?:gov|g[abdefghilmnpqrstuwy])" - + "|h[kmnrtu]" - + "|(?:info|int|i[delmnoqrst])" - + "|(?:jobs|j[emop])" - + "|k[eghimnrwyz]" - + "|l[abcikrstuvy]" - + "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" - + "|(?:name|net|n[acefgilopruz])" - + "|(?:org|om)" - + "|(?:pro|p[aefghklmnrstwy])" - + "|qa" - + "|r[eouw]" - + "|s[abcdeghijklmnortuvyz]" - + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" - + "|u[agkmsyz]" - + "|v[aceginu]" - + "|w[fs]" - + "|y[etu]" - + "|z[amw]))" - + "|(?:(?:25[0-5]|2[0-4]" // or ip address - + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" - + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" - + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" - + "|[1-9][0-9]|[0-9])))" - + "(?:\\:\\d{1,5})?)" // plus option port number - + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params - + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" - + "(?:\\b|$)"); // and finally, a word boundary or end of + public static final String TOP_LEVEL_DOMAIN_STR = + "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + + "|(biz|b[abdefghijmnorstvwyz])" + + "|(cat|com|coop|c[acdfghiklmnoruvxyz])" + + "|d[ejkmoz]" + + "|(edu|e[cegrstu])" + + "|f[ijkmor]" + + "|(gov|g[abdefghilmnpqrstuwy])" + + "|h[kmnrtu]" + + "|(info|int|i[delmnoqrst])" + + "|(jobs|j[emop])" + + "|k[eghimnprwyz]" + + "|l[abcikrstuvy]" + + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" + + "|(name|net|n[acefgilopruz])" + + "|(org|om)" + + "|(pro|p[aefghklmnrstwy])" + + "|qa" + + "|r[eosuw]" + + "|s[abcdeghijklmnortuvyz]" + + "|(tel|travel|t[cdfghjklmnoprtvwz])" + + "|u[agksyz]" + + "|v[aceginu]" + + "|w[fs]" + + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)" + + "|y[et]" + + "|z[amw])"; + + /** + * Regular expression pattern to match all IANA top-level domains. + */ + public static final Pattern TOP_LEVEL_DOMAIN_PATTERN = + Pattern.compile(TOP_LEVEL_DOMAIN_STR); + + + /** + * Goegular expression to match all IANA top-level domains for WEB_URL. + * List accurate as of 2011/01/12. List taken from: + * http://data.iana.org/TLD/tlds-alpha-by-domain.txt + * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py + */ + public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = + "(?:" + + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + + "|(?:biz|b[abdefghijmnorstvwyz])" + + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" + + "|d[ejkmoz]" + + "|(?:edu|e[cegrstu])" + + "|f[ijkmor]" + + "|(?:gov|g[abdefghilmnpqrstuwy])" + + "|h[kmnrtu]" + + "|(?:info|int|i[delmnoqrst])" + + "|(?:jobs|j[emop])" + + "|k[eghimnprwyz]" + + "|l[abcikrstuvy]" + + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])" + + "|(?:name|net|n[acefgilopruz])" + + "|(?:org|om)" + + "|(?:pro|p[aefghklmnrstwy])" + + "|qa" + + "|r[eosuw]" + + "|s[abcdeghijklmnortuvyz]" + + "|(?:tel|travel|t[cdfghjklmnoprtvwz])" + + "|u[agksyz]" + + "|v[aceginu]" + + "|w[fs]" + + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)" + + "|y[et]" + + "|z[amw]))"; + + /* This comprises most common used Unicode characters allowed in IRI + * as detailed in RFC 3987. + * Specifically, those two byte Unicode characters are not included. + */ + public static final String GOOD_IRI_CHAR = + "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"; + + /** + * Regular expression pattern to match most part of RFC 3987 + * Internationalized URLs, aka IRIs. Commonly used Unicode characters are + * added. + */ + public static final Pattern WEB_URL_PATTERN = Pattern.compile( + "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host + + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL + + "|(?:(?:25[0-5]|2[0-4]" // or ip address + + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + + "|[1-9][0-9]|[0-9])))" + + "(?:\\:\\d{1,5})?)" // plus option port number + + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params + + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + + "(?:\\b|$)"); // and finally, a word boundary or end of // input. This is to stop foo.sure from // matching as foo.su @@ -114,13 +145,13 @@ public class Regex public static final Pattern DOMAIN_NAME_PATTERN = Pattern.compile( - "(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" + "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+" + TOP_LEVEL_DOMAIN_PATTERN + ")|" + IP_ADDRESS_PATTERN + ")"); public static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile( - "[a-zA-Z0-9\\+\\.\\_\\%\\-]{1,256}" + + "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" + "\\@" + "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" + "(" + @@ -159,7 +190,7 @@ public class Regex * @return A String comprising all of the non-null matched * groups concatenated together */ - public static String concatGroups(Matcher matcher) + public static final String concatGroups(Matcher matcher) { StringBuilder b = new StringBuilder(); final int numGroups = matcher.groupCount(); @@ -189,7 +220,7 @@ public class Regex * @return A String comprising all of the digits and plus in * the match */ - public static String digitsAndPlusOnly(Matcher matcher) + public static final String digitsAndPlusOnly(Matcher matcher) { StringBuilder buffer = new StringBuilder(); String matchingRegion = matcher.group(); @@ -205,4 +236,5 @@ public class Regex } return buffer.toString(); } + }