2009-06-06 11:46:17 -04:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 The Android Open Source Project
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
2011-01-13 20:37:46 -05:00
|
|
|
*
|
|
|
|
* Imported from AOSP on 2011-01-12 by JRV.
|
|
|
|
* Domain patterns updated from IANA on 2010-01-12
|
|
|
|
*
|
|
|
|
*
|
2009-06-06 11:46:17 -04:00
|
|
|
*/
|
|
|
|
|
2010-05-19 15:16:36 -04:00
|
|
|
package com.fsck.k9.helper;
|
2009-06-06 11:46:17 -04:00
|
|
|
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
2011-01-13 20:37:46 -05:00
|
|
|
/**
|
|
|
|
* Commonly used regular expression patterns.
|
|
|
|
*/
|
2010-05-19 15:16:36 -04:00
|
|
|
public class Regex
{
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/01/12.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     * <p>This variant uses capturing groups; see
     * {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL} for the non-capturing variant.
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(info|int|i[delmnoqrst])"
        + "|(jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(name|net|n[acefgilopruz])"
        + "|(org|om)"
        + "|(pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)"
        + "|y[et]"
        + "|z[amw])";

    /**
     * Regular expression pattern to match all IANA top-level domains.
     */
    public static final Pattern TOP_LEVEL_DOMAIN_PATTERN =
        Pattern.compile(TOP_LEVEL_DOMAIN_STR);

    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/01/12.  List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
     *
     * <p>Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but every group is
     * non-capturing so that embedding it in {@link #WEB_URL_PATTERN} does not
     * add capture groups.
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(?:edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(?:gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(?:info|int|i[delmnoqrst])"
        + "|(?:jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(?:name|net|n[acefgilopruz])"
        + "|(?:org|om)"
        + "|(?:pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        + "|(?:xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-80akhbyknj4f|xn\\-\\-9t4b11yi5a|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah)"
        + "|y[et]"
        + "|z[amw]))";

    /**
     * Character-class fragment (no surrounding brackets) listing the most
     * commonly used Unicode characters allowed in IRIs, as detailed in RFC 3987.
     * Specifically, those two byte Unicode characters are not included.
     *
     * <p>Intended to be spliced between {@code [} and {@code ]} when building
     * larger expressions, as done by the URL and domain-name patterns below.
     */
    public static final String GOOD_IRI_CHAR =
        "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

    /**
     * Regular expression pattern to match most part of RFC 3987
     * Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     * added.
     *
     * <p>Structure: an optional scheme (http, https or rtsp) with optional
     * user-info, then a named host ending in a known top-level domain or a
     * dotted-quad IP address, an optional port, an optional path/query part,
     * and finally a word boundary or end of input.
     */
    public static final Pattern WEB_URL_PATTERN = Pattern.compile(
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host
        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL
        + "|(?:(?:25[0-5]|2[0-4]" // or ip address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus option port number
        + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus option query params
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su

    /**
     * Regular expression pattern to match dotted-quad addresses made of four
     * decimal fields of at most three digits, each limited to 255. Every field
     * is captured in its own group, and the whole address in group 1.
     */
    public static final Pattern IP_ADDRESS_PATTERN
        = Pattern.compile(
            "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
            + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
            + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
            + "|[1-9][0-9]|[0-9]))");

    /**
     * Regular expression pattern to match either a host name built from
     * {@link #GOOD_IRI_CHAR} labels and ending in a known top-level domain, or
     * an address accepted by {@link #IP_ADDRESS_PATTERN}.
     *
     * <p>The two {@link Pattern} constants are spliced in through string
     * concatenation, which uses {@link Pattern#toString()} and therefore
     * inserts their regular-expression source text.
     *
     * <p>NOTE(review): the label part nests one unbounded quantifier inside
     * another, which may backtrack heavily on long inputs that do not match;
     * confirm before applying this pattern to large untrusted text.
     */
    public static final Pattern DOMAIN_NAME_PATTERN
        = Pattern.compile(
            "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
            + TOP_LEVEL_DOMAIN_PATTERN + ")|"
            + IP_ADDRESS_PATTERN + ")");

    /**
     * Regular expression pattern to match things that look like e-mail
     * addresses: a local part of 1 to 256 ASCII letters, digits or
     * {@code + . _ % -} characters, an {@code @} sign, a first host label and
     * then one or more dot-separated labels made of ASCII letters, digits and
     * dashes. The top-level domain is not checked against the IANA list.
     */
    public static final Pattern EMAIL_ADDRESS_PATTERN
        = Pattern.compile(
            "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
            "\\@" +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
            "(" +
            "\\." +
            "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
            ")+"
        );

    /**
     * This pattern is intended for searching for things that look like they
     * might be phone numbers in arbitrary text, not for validating whether
     * something is in fact a phone number.  It will miss many things that
     * are legitimate phone numbers.
     *
     * <p> The pattern matches the following:
     * <ul>
     * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
     * may follow.
     * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
     * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
     * </ul>
     */
    public static final Pattern PHONE_PATTERN
        = Pattern.compile(                                  // sdd = space, dot, or dash
            "(\\+[0-9]+[\\- \\.]*)?"                    // +<digits><sdd>*
            + "(\\([0-9]+\\)[\\- \\.]*)?"               // (<digits>)<sdd>*
            + "([0-9][0-9\\- \\.][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

    /**
     *  Convenience method to take all of the non-null matching groups in a
     *  regex Matcher and return them as a concatenated string.
     *
     *  <p>Only the capturing groups (1 up to {@link Matcher#groupCount()}) are
     *  used; group 0, the whole match, is not included. Groups that did not
     *  take part in the match are skipped.
     *
     *  @param matcher      The Matcher object from which grouped text will
     *                      be extracted; it must hold a successful match
     *
     *  @return             A String comprising all of the non-null matched
     *                      groups concatenated together
     *
     *  @throws IllegalStateException if no match has been attempted yet or the
     *                      previous match operation failed
     */
    public static final String concatGroups(Matcher matcher)
    {
        StringBuilder joined = new StringBuilder();
        final int groupCount = matcher.groupCount();

        for (int index = 1; index <= groupCount; index++)
        {
            // group() yields null for an optional group that was not matched.
            String groupText = matcher.group(index);

            if (groupText != null)
            {
                joined.append(groupText);
            }
        }

        return joined.toString();
    }

    /**
     * Convenience method to return only the digits and plus signs
     * in the matching string.
     *
     * <p>Digits are recognised with {@link Character#isDigit(char)}, so
     * non-ASCII decimal digits are kept as well.
     *
     * @param matcher      The Matcher object from which digits and plus will
     *                     be extracted; it must hold a successful match
     *
     * @return             A String comprising all of the digits and plus in
     *                     the match
     *
     * @throws IllegalStateException if no match has been attempted yet or the
     *                     previous match operation failed
     */
    public static final String digitsAndPlusOnly(Matcher matcher)
    {
        StringBuilder buffer = new StringBuilder();
        String matchingRegion = matcher.group();

        for (int i = 0, size = matchingRegion.length(); i < size; i++)
        {
            char character = matchingRegion.charAt(i);

            if (character == '+' || Character.isDigit(character))
            {
                buffer.append(character);
            }
        }
        return buffer.toString();
    }
}
|