Update URL regexes by importing from AOSP and then from IANA

This commit is contained in:
Jesse Vincent 2011-01-14 01:37:46 +00:00
parent 6f4380d775
commit 8bb7613471
1 changed files with 113 additions and 81 deletions

View File

@ -12,6 +12,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*
* Imported from AOSP on 2011-01-12 by JRV.
* Domain patterns updated from IANA on 2011-01-12
*
*
*/ */
package com.fsck.k9.helper; package com.fsck.k9.helper;
@ -19,89 +24,115 @@ package com.fsck.k9.helper;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/**
* Commonly used regular expression patterns.
*/
public class Regex public class Regex
{ {
/**
* Regular expression pattern to match all IANA top-level domains.
* List accurate as of 2007/06/15. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py
*/
public static final Pattern TOP_LEVEL_DOMAIN_PATTERN
= Pattern.compile(
"((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
+ "|(biz|b[abdefghijmnorstvwyz])"
+ "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
+ "|d[ejkmoz]"
+ "|(edu|e[cegrstu])"
+ "|f[ijkmor]"
+ "|(gov|g[abdefghilmnpqrstuwy])"
+ "|h[kmnrtu]"
+ "|(info|int|i[delmnoqrst])"
+ "|(jobs|j[emop])"
+ "|k[eghimnrwyz]"
+ "|l[abcikrstuvy]"
+ "|(mil|mobi|museum|m[acdghklmnopqrstuvwxyz])"
+ "|(name|net|n[acefgilopruz])"
+ "|(org|om)"
+ "|(pro|p[aefghklmnrstwy])"
+ "|qa"
+ "|r[eouw]"
+ "|s[abcdeghijklmnortuvyz]"
+ "|(tel|travel|t[cdfghjklmnoprtvwz])"
+ "|u[agkmsyz]"
+ "|v[aceginu]"
+ "|w[fs]"
+ "|y[etu]"
+ "|z[amw])");
/** /**
* Regular expression pattern to match RFC 1738 URLs * Regular expression to match all IANA top-level domains.
* List accurate as of 2007/06/15. List taken from: * List accurate as of 2011/01/12. List taken from:
* http://data.iana.org/TLD/tlds-alpha-by-domain.txt * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
* This pattern is auto-generated by //device/tools/make-iana-tld-pattern.py * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
*/ */
public static final Pattern WEB_URL_PATTERN public static final String TOP_LEVEL_DOMAIN_STR =
= Pattern.compile( "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
"((?:(http|https|Http|Https):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + "|(biz|b[abdefghijmnorstvwyz])"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + "|d[ejkmoz]"
+ "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host + "|(edu|e[cegrstu])"
+ "(?:" // plus top level domain + "|f[ijkmor]"
+ "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])" + "|(gov|g[abdefghilmnpqrstuwy])"
+ "|(?:biz|b[abdefghijmnorstvwyz])" + "|h[kmnrtu]"
+ "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])" + "|(info|int|i[delmnoqrst])"
+ "|d[ejkmoz]" + "|(jobs|j[emop])"
+ "|(?:edu|e[cegrstu])" + "|k[eghimnprwyz]"
+ "|f[ijkmor]" + "|l[abcikrstuvy]"
+ "|(?:gov|g[abdefghilmnpqrstuwy])" + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
+ "|h[kmnrtu]" + "|(name|net|n[acefgilopruz])"
+ "|(?:info|int|i[delmnoqrst])" + "|(org|om)"
+ "|(?:jobs|j[emop])" + "|(pro|p[aefghklmnrstwy])"
+ "|k[eghimnrwyz]" + "|qa"
+ "|l[abcikrstuvy]" + "|r[eosuw]"
+ "|(?:mil|mobi|museum|m[acdghklmnopqrstuvwxyz])" + "|s[abcdeghijklmnortuvyz]"
+ "|(?:name|net|n[acefgilopruz])" + "|(tel|travel|t[cdfghjklmnoprtvwz])"
+ "|(?:org|om)" + "|u[agksyz]"
+ "|(?:pro|p[aefghklmnrstwy])" + "|v[aceginu]"
+ "|qa" + "|w[fs]"
+ "|r[eouw]" + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)"
+ "|s[abcdeghijklmnortuvyz]" + "|y[et]"
+ "|(?:tel|travel|t[cdfghjklmnoprtvwz])" + "|z[amw])";
+ "|u[agkmsyz]"
+ "|v[aceginu]" /**
+ "|w[fs]" * Regular expression pattern to match all IANA top-level domains.
+ "|y[etu]" */
+ "|z[amw]))" public static final Pattern TOP_LEVEL_DOMAIN_PATTERN =
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address Pattern.compile(TOP_LEVEL_DOMAIN_STR);
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" /**
+ "|[1-9][0-9]|[0-9])))" * Regular expression to match all IANA top-level domains for WEB_URL.
+ "(?:\\:\\d{1,5})?)" // plus option port number * List accurate as of 2011/01/12. List taken from:
+ "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+ "(?:\\b|$)"); // and finally, a word boundary or end of */
public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
"(?:"
+ "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
+ "|(?:biz|b[abdefghijmnorstvwyz])"
+ "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
+ "|d[ejkmoz]"
+ "|(?:edu|e[cegrstu])"
+ "|f[ijkmor]"
+ "|(?:gov|g[abdefghilmnpqrstuwy])"
+ "|h[kmnrtu]"
+ "|(?:info|int|i[delmnoqrst])"
+ "|(?:jobs|j[emop])"
+ "|k[eghimnprwyz]"
+ "|l[abcikrstuvy]"
+ "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
+ "|(?:name|net|n[acefgilopruz])"
+ "|(?:org|om)"
+ "|(?:pro|p[aefghklmnrstwy])"
+ "|qa"
+ "|r[eosuw]"
+ "|s[abcdeghijklmnortuvyz]"
+ "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
+ "|u[agksyz]"
+ "|v[aceginu]"
+ "|w[fs]"
+ "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)"
+ "|y[et]"
+ "|z[amw]))";
/* This comprises the most commonly used Unicode characters allowed in IRIs
* as detailed in RFC 3987.
* Specifically, characters that need two UTF-16 code units (surrogate
* pairs) are not included.
*/
public static final String GOOD_IRI_CHAR =
"a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";
/**
* Regular expression pattern to match most part of RFC 3987
* Internationalized URLs, aka IRIs. Commonly used Unicode characters are
* added.
*/
public static final Pattern WEB_URL_PATTERN = Pattern.compile(
"((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+ "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+ "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+ "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+" // named host
+ TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
+ "|(?:(?:25[0-5]|2[0-4]" // or ip address
+ "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+ "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+ "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+ "|[1-9][0-9]|[0-9])))"
+ "(?:\\:\\d{1,5})?)" // plus optional port number
+ "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus optional query params
+ "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+ "(?:\\b|$)"); // and finally, a word boundary or end of
// input. This is to stop foo.sure from // input. This is to stop foo.sure from
// matching as foo.su // matching as foo.su
@ -114,13 +145,13 @@ public class Regex
public static final Pattern DOMAIN_NAME_PATTERN public static final Pattern DOMAIN_NAME_PATTERN
= Pattern.compile( = Pattern.compile(
"(((([a-zA-Z0-9][a-zA-Z0-9\\-]*)*[a-zA-Z0-9]\\.)+" "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
+ TOP_LEVEL_DOMAIN_PATTERN + ")|" + TOP_LEVEL_DOMAIN_PATTERN + ")|"
+ IP_ADDRESS_PATTERN + ")"); + IP_ADDRESS_PATTERN + ")");
public static final Pattern EMAIL_ADDRESS_PATTERN public static final Pattern EMAIL_ADDRESS_PATTERN
= Pattern.compile( = Pattern.compile(
"[a-zA-Z0-9\\+\\.\\_\\%\\-]{1,256}" + "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
"\\@" + "\\@" +
"[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" + "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
"(" + "(" +
@ -159,7 +190,7 @@ public class Regex
* @return A String comprising all of the non-null matched * @return A String comprising all of the non-null matched
* groups concatenated together * groups concatenated together
*/ */
public static String concatGroups(Matcher matcher) public static final String concatGroups(Matcher matcher)
{ {
StringBuilder b = new StringBuilder(); StringBuilder b = new StringBuilder();
final int numGroups = matcher.groupCount(); final int numGroups = matcher.groupCount();
@ -189,7 +220,7 @@ public class Regex
* @return A String comprising all of the digits and plus in * @return A String comprising all of the digits and plus in
* the match * the match
*/ */
public static String digitsAndPlusOnly(Matcher matcher) public static final String digitsAndPlusOnly(Matcher matcher)
{ {
StringBuilder buffer = new StringBuilder(); StringBuilder buffer = new StringBuilder();
String matchingRegion = matcher.group(); String matchingRegion = matcher.group();
@ -205,4 +236,5 @@ public class Regex
} }
return buffer.toString(); return buffer.toString();
} }
} }