我正在寻找最终的邮政编码和邮政编码正则表达式。我正在寻找一种能覆盖世界大部分地区(希望是所有地区)的东西。
当前回答
请注意,这是一个相当难的问题,正如公认的答案所述。 我想这并没有阻止geonames.org的人们。 他们有一个文件,一个国家信息文件,这并不完全符合这个答案-限制是30000字符显然。有大约150个国家的正则表达式。
我在这里提取了与这个问题相关的部分:
AD ^(?:AD)*(\d{3})$
AM ^(\d{6})$
AR ^([A-Z]\d{4}[A-Z]{3})$
AT ^(\d{4})$
AU ^(\d{4})$
AX ^(?:FI)*(\d{5})$
AZ ^(?:AZ)*(\d{4})$
BA ^(\d{5})$
BB ^(?:BB)*(\d{5})$
BD ^(\d{4})$
BE ^(\d{4})$
BG ^(\d{4})$
BH ^(\d{3}\d?)$
BM ^([A-Z]{2}\d{2})$
BN ^([A-Z]{2}\d{4})$
BR ^(\d{8})$
BY ^(\d{6})$
CA ^([ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ]) ?(\d[ABCEGHJKLMNPRSTVWXYZ]\d)$
CH ^(\d{4})$
CL ^(\d{7})$
CN ^(\d{6})$
CR ^(\d{4})$
CU ^(?:CP)*(\d{5})$
CV ^(\d{4})$
CX ^(\d{4})$
CY ^(\d{4})$
CZ ^(\d{5})$
DE ^(\d{5})$
DK ^(\d{4})$
DO ^(\d{5})$
DZ ^(\d{5})$
EC ^([a-zA-Z]\d{4}[a-zA-Z])$
EE ^(\d{5})$
EG ^(\d{5})$
ES ^(\d{5})$
ET ^(\d{4})$
FI ^(?:FI)*(\d{5})$
FM ^(\d{5})$
FO ^(?:FO)*(\d{3})$
FR ^(\d{5})$
GB ^(([A-Z]\d{2}[A-Z]{2})|([A-Z]\d{3}[A-Z]{2})|([A-Z]{2}\d{2}[A-Z]{2})|([A-Z]{2}\d{3}[A-Z]{2})|([A-Z]\d[A-Z]\d[A-Z]{2})|([A-Z]{2}\d[A-Z]\d[A-Z]{2})|(GIR0AA))$
GE ^(\d{4})$
GF ^((97|98)3\d{2})$
GG ^(([A-Z]\d{2}[A-Z]{2})|([A-Z]\d{3}[A-Z]{2})|([A-Z]{2}\d{2}[A-Z]{2})|([A-Z]{2}\d{3}[A-Z]{2})|([A-Z]\d[A-Z]\d[A-Z]{2})|([A-Z]{2}\d[A-Z]\d[A-Z]{2})|(GIR0AA))$
GL ^(\d{4})$
GP ^((97|98)\d{3})$
GR ^(\d{5})$
GT ^(\d{5})$
GU ^(969\d{2})$
GW ^(\d{4})$
HN ^([A-Z]{2}\d{4})$
HR ^(?:HR)*(\d{5})$
HT ^(?:HT)*(\d{4})$
HU ^(\d{4})$
ID ^(\d{5})$
IL ^(\d{5})$
IM ^(([A-Z]\d{2}[A-Z]{2})|([A-Z]\d{3}[A-Z]{2})|([A-Z]{2}\d{2}[A-Z]{2})|([A-Z]{2}\d{3}[A-Z]{2})|([A-Z]\d[A-Z]\d[A-Z]{2})|([A-Z]{2}\d[A-Z]\d[A-Z]{2})|(GIR0AA))$
IN ^(\d{6})$
IQ ^(\d{5})$
IR ^(\d{10})$
IS ^(\d{3})$
IT ^(\d{5})$
JE ^(([A-Z]\d{2}[A-Z]{2})|([A-Z]\d{3}[A-Z]{2})|([A-Z]{2}\d{2}[A-Z]{2})|([A-Z]{2}\d{3}[A-Z]{2})|([A-Z]\d[A-Z]\d[A-Z]{2})|([A-Z]{2}\d[A-Z]\d[A-Z]{2})|(GIR0AA))$
JO ^(\d{5})$
JP ^(\d{7})$
KE ^(\d{5})$
KG ^(\d{6})$
KH ^(\d{5})$
KP ^(\d{6})$
KR ^(?:SEOUL)*(\d{6})$
KW ^(\d{5})$
KZ ^(\d{6})$
LA ^(\d{5})$
LB ^(\d{4}(\d{4})?)$
LI ^(\d{4})$
LK ^(\d{5})$
LR ^(\d{4})$
LS ^(\d{3})$
LT ^(?:LT)*(\d{5})$
LU ^(\d{4})$
LV ^(?:LV)*(\d{4})$
MA ^(\d{5})$
MC ^(\d{5})$
MD ^(?:MD)*(\d{4})$
ME ^(\d{5})$
MG ^(\d{3})$
MK ^(\d{4})$
MM ^(\d{5})$
MN ^(\d{6})$
MQ ^(\d{5})$
MT ^([A-Z]{3}\d{2}\d?)$
MV ^(\d{5})$
MX ^(\d{5})$
MY ^(\d{5})$
MZ ^(\d{4})$
NC ^(\d{5})$
NE ^(\d{4})$
NF ^(\d{4})$
NG ^(\d{6})$
NI ^(\d{7})$
NL ^(\d{4}[A-Z]{2})$
NO ^(\d{4})$
NP ^(\d{5})$
NZ ^(\d{4})$
OM ^(\d{3})$
PF ^((97|98)7\d{2})$
PG ^(\d{3})$
PH ^(\d{4})$
PK ^(\d{5})$
PL ^(\d{5})$
PM ^(97500)$
PR ^(\d{9})$
PT ^(\d{7})$
PW ^(96940)$
PY ^(\d{4})$
RE ^((97|98)(4|7|8)\d{2})$
RO ^(\d{6})$
RS ^(\d{6})$
RU ^(\d{6})$
SA ^(\d{5})$
SD ^(\d{5})$
SE ^(?:SE)*(\d{5})$
SG ^(\d{6})$
SH ^(STHL1ZZ)$
SI ^(?:SI)*(\d{4})$
SK ^(\d{5})$
SM ^(4789\d)$
SN ^(\d{5})$
SO ^([A-Z]{2}\d{5})$
SV ^(?:CP)*(\d{4})$
SZ ^([A-Z]\d{3})$
TC ^(TKCA 1ZZ)$
TH ^(\d{5})$
TJ ^(\d{6})$
TM ^(\d{6})$
TN ^(\d{4})$
TR ^(\d{5})$
TW ^(\d{5})$
UA ^(\d{5})$
US ^\d{5}(-\d{4})?$
UY ^(\d{5})$
UZ ^(\d{6})$
VA ^(\d{5})$
VE ^(\d{4})$
VI ^\d{5}(-\d{4})?$
VN ^(\d{6})$
WF ^(986\d{2})$
YT ^(\d{5})$
ZA ^(\d{4})$
ZM ^(\d{5})$
CS ^(\d{5})$
希望我没有犯错,我的regex-fu很弱。
其他回答
你为什么要这么做,你为什么这么在乎?正如Tom Ritter所指出的,你是否有一个ZIP/邮政编码并不重要,更不用说它是否有效了,除非你真的要把东西发送到那个地址。即使你希望有一天你会给他们寄东西,这并不意味着你今天就需要邮政编码。
unicode CLDR包含每个国家的邮政编码正则表达式。(总共158个正则表达式!)
从http://unicode.org/Public/cldr/26.0.1/下载core.zip 解压缩core.zip 从解压缩的内容(直接内容:common/ supplementary /postalCodeData.xml)中查看common/ supplementary /postalCodeData.xml
谷歌还有一个web服务,提供每个国家的地址格式信息,包括邮政编码,网址是http://i18napis.appspot.com/address (我通过http://unicode.org/review/pri180/找到了这个链接)
编辑
这里是postalCodeData.xml正则表达式的副本:
"GB", "GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\d{1,4}"
"JE", "JE\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}"
"GG", "GY\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}"
"IM", "IM\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}"
"US", "\d{5}([ \-]\d{4})?"
"CA", "[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ ]?\d[ABCEGHJ-NPRSTV-Z]\d"
"DE", "\d{5}"
"JP", "\d{3}-\d{4}"
"FR", "\d{2}[ ]?\d{3}"
"AU", "\d{4}"
"IT", "\d{5}"
"CH", "\d{4}"
"AT", "\d{4}"
"ES", "\d{5}"
"NL", "\d{4}[ ]?[A-Z]{2}"
"BE", "\d{4}"
"DK", "\d{4}"
"SE", "\d{3}[ ]?\d{2}"
"NO", "\d{4}"
"BR", "\d{5}[\-]?\d{3}"
"PT", "\d{4}([\-]\d{3})?"
"FI", "\d{5}"
"AX", "22\d{3}"
"KR", "\d{3}[\-]\d{3}"
"CN", "\d{6}"
"TW", "\d{3}(\d{2})?"
"SG", "\d{6}"
"DZ", "\d{5}"
"AD", "AD\d{3}"
"AR", "([A-HJ-NP-Z])?\d{4}([A-Z]{3})?"
"AM", "(37)?\d{4}"
"AZ", "\d{4}"
"BH", "((1[0-2]|[2-9])\d{2})?"
"BD", "\d{4}"
"BB", "(BB\d{5})?"
"BY", "\d{6}"
"BM", "[A-Z]{2}[ ]?[A-Z0-9]{2}"
"BA", "\d{5}"
"IO", "BBND 1ZZ"
"BN", "[A-Z]{2}[ ]?\d{4}"
"BG", "\d{4}"
"KH", "\d{5}"
"CV", "\d{4}"
"CL", "\d{7}"
"CR", "\d{4,5}|\d{3}-\d{4}"
"HR", "\d{5}"
"CY", "\d{4}"
"CZ", "\d{3}[ ]?\d{2}"
"DO", "\d{5}"
"EC", "([A-Z]\d{4}[A-Z]|(?:[A-Z]{2})?\d{6})?"
"EG", "\d{5}"
"EE", "\d{5}"
"FO", "\d{3}"
"GE", "\d{4}"
"GR", "\d{3}[ ]?\d{2}"
"GL", "39\d{2}"
"GT", "\d{5}"
"HT", "\d{4}"
"HN", "(?:\d{5})?"
"HU", "\d{4}"
"IS", "\d{3}"
"IN", "\d{6}"
"ID", "\d{5}"
"IL", "\d{5}"
"JO", "\d{5}"
"KZ", "\d{6}"
"KE", "\d{5}"
"KW", "\d{5}"
"LA", "\d{5}"
"LV", "\d{4}"
"LB", "(\d{4}([ ]?\d{4})?)?"
"LI", "(948[5-9])|(949[0-7])"
"LT", "\d{5}"
"LU", "\d{4}"
"MK", "\d{4}"
"MY", "\d{5}"
"MV", "\d{5}"
"MT", "[A-Z]{3}[ ]?\d{2,4}"
"MU", "(\d{3}[A-Z]{2}\d{3})?"
"MX", "\d{5}"
"MD", "\d{4}"
"MC", "980\d{2}"
"MA", "\d{5}"
"NP", "\d{5}"
"NZ", "\d{4}"
"NI", "((\d{4}-)?\d{3}-\d{3}(-\d{1})?)?"
"NG", "(\d{6})?"
"OM", "(PC )?\d{3}"
"PK", "\d{5}"
"PY", "\d{4}"
"PH", "\d{4}"
"PL", "\d{2}-\d{3}"
"PR", "00[679]\d{2}([ \-]\d{4})?"
"RO", "\d{6}"
"RU", "\d{6}"
"SM", "4789\d"
"SA", "\d{5}"
"SN", "\d{5}"
"SK", "\d{3}[ ]?\d{2}"
"SI", "\d{4}"
"ZA", "\d{4}"
"LK", "\d{5}"
"TJ", "\d{6}"
"TH", "\d{5}"
"TN", "\d{4}"
"TR", "\d{5}"
"TM", "\d{6}"
"UA", "\d{5}"
"UY", "\d{5}"
"UZ", "\d{6}"
"VA", "00120"
"VE", "\d{4}"
"ZM", "\d{5}"
"AS", "96799"
"CC", "6799"
"CK", "\d{4}"
"RS", "\d{6}"
"ME", "8\d{4}"
"CS", "\d{5}"
"YU", "\d{5}"
"CX", "6798"
"ET", "\d{4}"
"FK", "FIQQ 1ZZ"
"NF", "2899"
"FM", "(9694[1-4])([ \-]\d{4})?"
"GF", "9[78]3\d{2}"
"GN", "\d{3}"
"GP", "9[78][01]\d{2}"
"GS", "SIQQ 1ZZ"
"GU", "969[123]\d([ \-]\d{4})?"
"GW", "\d{4}"
"HM", "\d{4}"
"IQ", "\d{5}"
"KG", "\d{6}"
"LR", "\d{4}"
"LS", "\d{3}"
"MG", "\d{3}"
"MH", "969[67]\d([ \-]\d{4})?"
"MN", "\d{6}"
"MP", "9695[012]([ \-]\d{4})?"
"MQ", "9[78]2\d{2}"
"NC", "988\d{2}"
"NE", "\d{4}"
"VI", "008(([0-4]\d)|(5[01]))([ \-]\d{4})?"
"PF", "987\d{2}"
"PG", "\d{3}"
"PM", "9[78]5\d{2}"
"PN", "PCRN 1ZZ"
"PW", "96940"
"RE", "9[78]4\d{2}"
"SH", "(ASCN|STHL) 1ZZ"
"SJ", "\d{4}"
"SO", "\d{5}"
"SZ", "[HLMS]\d{3}"
"TC", "TKCA 1ZZ"
"WF", "986\d{2}"
"XK", "\d{5}"
"YT", "976\d{2}"
如果有人仍然对如何验证邮政编码感兴趣,我找到了一个解决方案:
使用谷歌地理编码API,我们可以检查具有国家代码和邮政编码本身的邮政编码的有效性。
例如,我住在乌克兰,所以我可以这样检查: https://maps.googleapis.com/maps/api/geocode/json?components=postal_code:80380|country:UA
或者使用JS API: https://developers.google.com/maps/documentation/javascript/geocoding#ComponentFiltering
其中80380是乌克兰有效的ZIP,实际上每个(#####)是有效的。
如果没有发现,谷歌返回ZERO_RESULTS状态。 或者OK和一个结果,如果两者都正确。
希望这对你有所帮助。
有人问关于格式化邮件地址的列表,我想这就是他要找的…
弗兰克的邮政地址强制指南:http://www.columbia.edu/~fdc/postal/ 但这对解决街头问题没什么帮助。
我的工作使用了一些工具来帮助实现这一点: - Lexis-Nexis服务,包括NCOA查询(您将“免费”获得地址标准化) -“梅丽莎数据”http://www.melissadata.com
我知道这是一个老问题,但我无意中遇到了同样的问题。 我有来自100多个国家的发票,并试图得到正确的债权人在zip(如果每其他检查失败)。 所以我所做的就是写一个简短的Python脚本,从一个字符串创建一个模式:
class RegexPatternBuilder:
"""
Builds a regex pattern out of a given string(i.e. --> HM452 AX2155 : [A-Z]{2}\d{3}\s{1}[A-Z]{2}\d{4})
"""
__is_alpha_count = 0
__is_numeric_count = 0
__is_whitespace_count = 0
__pattern = ""
# Count: wich character of the string we're locking at right now
__count = 0
# Countrys like Andora starts theire ZIP with the country abbreviation :AD500
# So check at first if the ZIP starts with the abbreviation and if so, add it to the pattern and increase the count.
def __init__(self, zip_string, country):
self.__zip_string = zip_string
self.__country = country
if self.__zip_string.startswith(country):
self.__pattern = f'({self.__country})'
self.__count += len(self.__country)
def build_regex(self):
# Last step ;
# Add the current alpha_numeric pattern with count
if len(self.__zip_string) == self.__count:
if self.__is_alpha_count:
self.__pattern += f"[A-Z]{{{self.__is_alpha_count}}}"
if self.__is_numeric_count:
self.__pattern += f"\d{{{self.__is_numeric_count}}}"
return f'{self.__pattern}\\b'
# Case: Whitespace
# Check if there is a crossing from numeric / alphanumeric to whitespace,
# if so --> add the alpha_numeric regex to the whole pattern with the
# count as the number of viable appeaerances.
# Since there is max 1 whitespace in a ZIP, add the whitespace regex immediately.
# Every other case is similar to that.
if self.__zip_string[self.__count].isspace():
if self.__is_numeric_count:
self.__pattern += f"\d{{{self.__is_numeric_count}}}"
if self.__is_alpha_count:
self.__pattern += f"[A-Z]{{{self.__is_alpha_count}}}"
self.__pattern += "\s{1}"
self.__is_whitespace_count += 1
self.__is_alpha_count = 0
self.__is_numeric_count = 0
# Case: Is Alphanumeric
if self.__zip_string[self.__count].isalpha():
if self.__is_numeric_count:
self.__pattern += f"[0-9]{{{self.__is_numeric_count}}}"
self.__is_whitespace_count = 0
self.__is_alpha_count += 1
self.__is_numeric_count = 0
# Case: Is Numeric
if self.__zip_string[self.__count].isnumeric():
if self.__is_alpha_count:
self.__pattern += f"[A-Z]{{{self.__is_alpha_count}}}"
self.__is_whitespace_count = 0
self.__is_alpha_count = 0
self.__is_numeric_count += 1
# Case: Special Character (i.e. - )
# No escaping or count for this so far, because it shouldn't be needed for our zip purposes
if not self.__zip_string[self.__count].isalpha() \
and not self.__zip_string[self.__count].isnumeric() \
and not self.__zip_string[self.__count].isspace():
self.__pattern += f'{self.__zip_string[self.__count]}{{1}}'
self.__count += 1
return self.build_regex()
有了这个,我创建了所有不同的可能的正则表达式的所有拉链(按国家),我们历史上写回一个db表(即这样的东西在最后: 国家:RE PATTERN:(\d{5})\b[这可能是什么国家;d])
也许它能帮助别人。