一、格式说明
“|” 表示隔开的两部分只能出现其中一个,方括号[]括起来的部分表示可选,字面文字由双引号“”括起来,以n*开头表示其后的元素至少重复n次,n的缺省值为0。
二、总体格式
genericurl = scheme “:” schemepart
scheme = 1*[ lowalpha | digit | “+” | “-” | “.” ]
schemepart = *xchar | ip-schemepart
ip-schemepart = “//” login [ “/” urlpath ]
login = [ user [ “:” password ] “@” ] hostport
hostport = host [ “:” port ]
host = hostname | hostnumber
hostname = *[ domainlabel “.” ] toplabel
domainlabel = alphadigit | alphadigit *[ alphadigit | “-” ] alphadigit
toplabel = alpha | alpha *[ alphadigit | “-” ] alphadigit
alphadigit = alpha | digit
hostnumber = digits “.” digits “.” digits “.” digits
port = digits
user = *[ uchar | “;” | “?” | “&” | “=” ]
password = *[ uchar | “;” | “?” | “&” | “=” ]
urlpath = *xchar ; depends on protocol, see RFC 1738 section 3.1
三、常见scheme
; FTP (参见RFC959)
ftpurl = “ftp://” login [ “/” fpath [ “;type=” ftptype ]]
fpath = fsegment *[ “/” fsegment ]
fsegment = *[ uchar | “?” | “:” | “@” | “&” | “=” ]
ftptype = “A” | “I” | “D” | “a” | “i” | “d”
; FILE
fileurl = “file://” [ host | “localhost” ] “/” fpath
; HTTP
httpurl = “http://” hostport [ “/” hpath [ “?” search ]]
hpath = hsegment *[ “/” hsegment ]
hsegment = *[ uchar | “;” | “:” | “@” | “&” | “=” ]
search = *[ uchar | “;” | “:” | “@” | “&” | “=” ]
; GOPHER (参见RFC1436)
gopherurl = “gopher://” hostport [ “/” [ gtype [ selector
[ “%09” search [ “%09” gopher+_string ] ] ] ] ]
gtype = xchar
selector = *xchar
gopher+_string = *xchar
; MAILTO (参见 RFC822)
mailtourl = “mailto:” encoded822addr
encoded822addr = 1*xchar ; further defined in RFC822
; NEWS (参见 RFC1036)
newsurl = “news:” grouppart
grouppart = “*” | group | article
group = alpha *[ alpha | digit | “-” | “.” | “+” | “_” ]
article = 1*[ uchar | “;” | “/” | “?” | “:” | “&” | “=” ] “@” host
; NNTP (参见RFC977)
nntpurl = “nntp://” hostport “/” group [ “/” digits ]
; TELNET
telneturl = “telnet://” login [ “/” ]
; WAIS (参见 RFC1625)
waisurl = waisdatabase | waisindex | waisdoc
waisdatabase = “wais://” hostport “/” database
waisindex = “wais://” hostport “/” database “?” search
waisdoc = “wais://” hostport “/” database “/” wtype “/” wpath
database = *uchar
wtype = *uchar
wpath = *uchar
; PROSPERO
prosperourl = “prospero://” hostport “/” ppath *[ fieldspec ]
ppath = psegment *[ “/” psegment ]
psegment = *[ uchar | “?” | “:” | “@” | “&” | “=” ]
fieldspec = “;” fieldname “=” fieldvalue
fieldname = *[ uchar | “?” | “:” | “@” | “&” ]
fieldvalue = *[ uchar | “?” | “:” | “@” | “&” ]
四、杂类
lowalpha = “a” | “b” | “c” | “d” | “e” | “f” | “g” | “h” |
“i” | “j” | “k” | “l” | “m” | “n” | “o” | “p” |
“q” | “r” | “s” | “t” | “u” | “v” | “w” | “x” |
“y” | “z”
hialpha = “A” | “B” | “C” | “D” | “E” | “F” | “G” | “H” | “I” |
“J” | “K” | “L” | “M” | “N” | “O” | “P” | “Q” | “R” |
“S” | “T” | “U” | “V” | “W” | “X” | “Y” | “Z”
alpha = lowalpha | hialpha
digit = “0” | “1” | “2” | “3” | “4” | “5” | “6” | “7” |
“8” | “9”
safe = “$” | “-” | “_” | “.” | “+”
extra = “!” | “*” | “'” | “(” | “)” | “,”
national = “{” | “}” | “|” | “\” | “^” | “~” | “[” | “]” | “`”
punctuation = “<” | “>” | “#” | “%” | <">
reserved = “;” | “/” | “?” | “:” | “@” | “&” | “=”
hex = digit | “A” | “B” | “C” | “D” | “E” | “F” |
“a” | “b” | “c” | “d” | “e” | “f”
escape = “%” hex hex
unreserved = alpha | digit | safe | extra
uchar = unreserved | escape
xchar = unreserved | reserved | escape
digits = 1*digit
转自:http://dancewithnet.com/2007/01/30/url-canonicalization/
posted on 2008-08-19 00:13
merlinfang 阅读(723)
评论(0) 编辑 收藏 引用 所属分类:
搜索引擎