可能な限りRFCに準拠したEメールアドレス検証用正規表現という記事を以前投稿しましたが、今回はそのURI版です。
前回と違い、今度は「可能な限り」ではなくて 完全準拠 しています。多分。
はじめに
ウェブサービスを作っていると、入力されたURIが正しい形式か確かめたいということがあると思います。
今ならHTML5で`input`要素に`type="url"`が使えますし、そもそも検証せずとも実際にアクセスしてみればいいのですが、やっぱり事前に検証したいというときもありますよね。
ただ、RFC3986で規定されているURIの形式って結構複雑で、ほとんどのサービスでは簡易的な正規表現でチェックしてたりします。
そこで、RFCに準拠した正規表現に挑戦してみました。
とりあえず結果教えろ
はい。
/^[a-z]([a-z]|[0-9]|[+\-.])*:(\/\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|:)*@)?(\[((([0-9a-f]{1,4}:){6}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|::([0-9a-f]{1,4}:){5}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|([0-9a-f]{1,4})?::([0-9a-f]{1,4}:){4}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,1}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:){3}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,2}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:){2}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,3}[0-9a-f]{1,4})?::[0-9a-f]{1,4}:([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(([0-9a-f]{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+\.(([a-z]|[0-9]|[-._~])|[!$&'()*+,;=]|:)+)]|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}|(([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=])*)(:\d*)?(\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))*)*|\/(((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))+(\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))*)*)?|((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))+(\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))*)*|)(\?((([a-z]|[0-9]|[-._~])|%[
0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@])|[\/?])*)?(#((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@])|[\/?])*)?$/i
わけわかんねーよ解説しろよ
そんなこといわれても解説できません。こっちだってわけわかんねーんだよヽ(`Д´#)ノウワァァァン
生成過程を公開しますので、がんばって解析してください。
(ECMAScriptで書いてます)
// Building blocks for a URI pattern that follows the ABNF of RFC 3986.
// Every constant below is a regular-expression *source fragment* (a plain string);
// fragments are composed with template literals and the final result is `URI`.
//
// Usage notes:
//   - The letter classes only list lowercase characters, so the pattern has to be
//     compiled with the `i` flag.
//   - `URI` carries no anchors; wrap it when validating a whole string, e.g.
//     new RegExp(`^${URI}$`, "i").
const ALPHA = `[a-z]`;
const DIGIT = `[0-9]`;
const HEXDIG = `[0-9a-f]`;

// IPv4: each dec-octet is split by decimal range so that values above 255 are rejected
const DEC_OCTET_1 = DIGIT; // 0-9
const DEC_OCTET_2 = `[1-9]${DIGIT}`; // 10-99
const DEC_OCTET_3 = `1${DIGIT}{2}`; // 100-199
const DEC_OCTET_4 = `2[0-4]${DIGIT}`; // 200-249
const DEC_OCTET_5 = `25[0-5]`; // 250-255
const DEC_OCTET = `(${DEC_OCTET_1}|${DEC_OCTET_2}|${DEC_OCTET_3}|${DEC_OCTET_4}|${DEC_OCTET_5})`;
const IPV4ADDRESS = `${DEC_OCTET}(\\.${DEC_OCTET}){3}`; // four octets separated by "."

// IPv6: one alternative per position at which "::" may appear
const H16 = `${HEXDIG}{1,4}`; // 16 bits of address represented in hexadecimal
const LS32 = `(${H16}:${H16}|${IPV4ADDRESS})`; // least-significant 32 bits of address
const IPV6ADDRESS_1 = `(${H16}:){6}${LS32}`; // no "::" at all
const IPV6ADDRESS_2 = `::(${H16}:){5}${LS32}`;
const IPV6ADDRESS_3 = `(${H16})?::(${H16}:){4}${LS32}`;
const IPV6ADDRESS_4 = `((${H16}:){0,1}${H16})?::(${H16}:){3}${LS32}`;
const IPV6ADDRESS_5 = `((${H16}:){0,2}${H16})?::(${H16}:){2}${LS32}`;
const IPV6ADDRESS_6 = `((${H16}:){0,3}${H16})?::${H16}:${LS32}`;
const IPV6ADDRESS_7 = `((${H16}:){0,4}${H16})?::${LS32}`;
const IPV6ADDRESS_8 = `((${H16}:){0,5}${H16})?::${H16}`;
const IPV6ADDRESS_9 = `((${H16}:){0,6}${H16})?::`; // "::" at the very end
const IPV6ADDRESS = `(${IPV6ADDRESS_1}|${IPV6ADDRESS_2}|${IPV6ADDRESS_3}|${IPV6ADDRESS_4}|${IPV6ADDRESS_5}|${IPV6ADDRESS_6}|${IPV6ADDRESS_7}|${IPV6ADDRESS_8}|${IPV6ADDRESS_9})`;

// Percent-Encoding: https://tools.ietf.org/html/rfc3986#section-2.1
const PCT_ENCODED = `%${HEXDIG}${HEXDIG}`;

// Reserved Characters: https://tools.ietf.org/html/rfc3986#section-2.2
// Both brackets are escaped: an unescaped "]" would close the character class early and
// turn the fragment into "one of :/?#[ followed by the literal text @]".
const GEN_DELIMS = `[:/?#\\[\\]@]`;
const SUB_DELIMS = `[!$&'()*+,;=]`;
const RESERVED = `(${GEN_DELIMS}|${SUB_DELIMS})`;

// Unreserved Characters: https://tools.ietf.org/html/rfc3986#section-2.3
const UNRESERVED = `(${ALPHA}|${DIGIT}|[-._~])`;

// Scheme: https://tools.ietf.org/html/rfc3986#section-3.1
const SCHEME = `${ALPHA}(${ALPHA}|${DIGIT}|[+\\-.])*`; // must start with a letter

// User Information: https://tools.ietf.org/html/rfc3986#section-3.2.1
const USERINFO = `(${UNRESERVED}|${PCT_ENCODED}|${SUB_DELIMS}|:)*`;

// Host: https://tools.ietf.org/html/rfc3986#section-3.2.2
const IPVFUTURE = `v${HEXDIG}+\\.(${UNRESERVED}|${SUB_DELIMS}|:)+`;
const IP_LITERAL = `\\[(${IPV6ADDRESS}|${IPVFUTURE})]`; // IPv6 / IPvFuture enclosed in "[" "]"
const REG_NAME = `(${UNRESERVED}|${PCT_ENCODED}|${SUB_DELIMS})*`;
const HOST = `(${IP_LITERAL}|${IPV4ADDRESS}|${REG_NAME})`;

// Port: https://tools.ietf.org/html/rfc3986#section-3.2.3
const PORT = `\\d*`; // may be empty

// Authority: https://tools.ietf.org/html/rfc3986#section-3.2
const AUTHORITY = `(${USERINFO}@)?${HOST}(:${PORT})?`;

// Path: https://tools.ietf.org/html/rfc3986#section-3.3
const PCHAR = `(${UNRESERVED}|${PCT_ENCODED}|${SUB_DELIMS}|[:@])`;
const SEGMENT = `(${PCHAR})*`;
const SEGMENT_NZ = `(${PCHAR})+`;
const SEGMENT_NZ_NC = `(${UNRESERVED}|${PCT_ENCODED}|${SUB_DELIMS}|@)+`; // non-zero-length segment without any colon ":"
const PATH_EMPTY = ``; // zero characters
const PATH_ROOTLESS = `${SEGMENT_NZ}(/${SEGMENT})*`; // begins with a segment
const PATH_NOSCHEME = `${SEGMENT_NZ_NC}(/${SEGMENT})*`; // begins with a non-colon segment
const PATH_ABSOLUTE = `/(${SEGMENT_NZ}(/${SEGMENT})*)?`; // begins with "/" but not "//"
const PATH_ABEMPTY = `(/${SEGMENT})*`; // begins with "/" or is empty
const PATH = `(${PATH_ABEMPTY}|${PATH_ABSOLUTE}|${PATH_NOSCHEME}|${PATH_ROOTLESS}|${PATH_EMPTY})`;

// Query: https://tools.ietf.org/html/rfc3986#section-3.4
const QUERY = `(${PCHAR}|[/?])*`;

// Fragment: https://tools.ietf.org/html/rfc3986#section-3.5
const FRAGMENT = `(${PCHAR}|[/?])*`;

// Syntax Components: https://tools.ietf.org/html/rfc3986#section-3
// hier-part is either "//" authority path-abempty, or one of the authority-less path forms.
const HIER_PART = `(//${AUTHORITY}${PATH_ABEMPTY}|${PATH_ABSOLUTE}|${PATH_ROOTLESS}|${PATH_EMPTY})`;
const URI = `${SCHEME}:${HIER_PART}(\\?${QUERY})?(#${FRAGMENT})?`;
おまけ: HTTP/HTTPS用の正規表現
上のURIにちょっと制限をつければOK。それでも長いけどね。
ほとんどIPv4/v6が占めています。
/^https?:\/\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|:)*@)?(\[((([0-9a-f]{1,4}:){6}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|::([0-9a-f]{1,4}:){5}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|([0-9a-f]{1,4})?::([0-9a-f]{1,4}:){4}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,1}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:){3}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,2}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:){2}([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,3}[0-9a-f]{1,4})?::[0-9a-f]{1,4}:([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::([0-9a-f]{1,4}:[0-9a-f]{1,4}|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})|(([0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(([0-9a-f]{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+\.(([a-z]|[0-9]|[-._~])|[!$&'()*+,;=]|:)+)]|([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(\.([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}|(([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=])*)(:\d*)?(\/((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@]))*)*(\?((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@])|[\/?])*)?(#((([a-z]|[0-9]|[-._~])|%[0-9a-f][0-9a-f]|[!$&'()*+,;=]|[:@])|[\/?])*)?$/i
出典
- node-adjuster - 入力データの検証・修正を行うNode.js用の拙作ライブラリです。
  - 検証用の正規表現パターンとして `adjuster.STRING.PATTERN.URI` が定義されています。
  - 生成過程は `src/libs/regexp/uri.mjs` 内にあります。