'''
Created on 2012-11-28
@author: jiangxiaoqiang
Notes on regular expressions in Python.
'''
#coding = utf-8
import re
def regexpTest():
    """Print a guided tour of the ``re`` module to standard output.

    The demo covers, in order: ``re.search``, anchored e-mail validation,
    ``re.findall`` with and without capture groups, the ``re.IGNORECASE``
    flag, lookahead/lookbehind assertions, greedy versus non-greedy
    matching, and ``re.sub`` with group back-references.

    Takes no arguments and returns ``None``; every result is printed.
    """
    separator = "-------------------------------------"

    # re.search(pattern, string): all of the pattern must be matched, but
    # not all of the string -- the match may start anywhere inside it.
    print("re.search():")
    mystr = 'an example word:cat!!'
    # r'...' is a raw string: Python does not process the backslashes, so the
    # regex engine receives \w (one word character) three times.
    match = re.search(r'word:\w\w\w', mystr)
    if match:
        print('Found: ' + match.group())
    else:
        print("Didn't found!")
    print(separator)

    # E-mail validation example.
    print("邮箱验证:")
    # To validate a whole address, ^ and $ must not be omitted; without them
    # re.search would accept any string that merely contains an address.
    email_regexp = r'^[\w-]+(\.[\w-]+)*@([\w-]+\.)+[a-zA-Z]+$'
    email = 'feichexia@yahoo.com.cn'
    email_match = re.search(email_regexp, email)
    if email_match:
        print(email, 'is valid!')
    else:
        print(email, 'is not valid!')
    print(separator)

    # re.findall() returns every non-overlapping match in the string.
    print("re.findall():")
    email_regexp = r'[\w\.-]+@[\w\.-]+'
    email_str = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
    email_matches = re.findall(email_regexp, email_str)
    # No capture groups in the pattern, so the result is a list of strings:
    # ['alice@google.com', 'bob@abc.com']
    print(email_matches)
    for email in email_matches:
        print(email)
    print(separator)

    # Same text, almost the same pattern -- but the parentheses create two
    # capture groups, so findall() now returns a list of tuples holding
    # (group(1), group(2)) for each match:
    # [('alice', 'google.com'), ('bob', 'abc.com')]
    print('re.findall():元组')
    mystr = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
    tuples = re.findall(r'([\w\.-]+)@([\w\.-]+)', mystr)
    print(tuples)
    for user, host in tuples:
        print('用户名:' + user)
        print('Host: ' + host)
    print(separator)

    # Matching flags:
    #   re.IGNORECASE  ignore upper/lower case.
    #   re.MULTILINE   within a string made of many lines, allow ^ and $ to
    #                  match the start and end of each line. Normally ^/$
    #                  only match the start and end of the whole string.
    #   re.DOTALL      allow dot (.) to match newline -- normally it matches
    #                  anything but newline. This can trip you up: you think
    #                  .* matches everything, but by default it does not go
    #                  past the end of a line.
    # Note that \s (whitespace) includes newlines, so to match a run of
    # whitespace that may include a newline you can just use \s*.
    print("带额外匹配选项的正则匹配:")
    multistr = 'Foo None what bar\nNot know universe\nTrue nong'
    regexp = r'no'
    matches = re.findall(regexp, multistr, re.IGNORECASE)
    print("共匹配个数:" + str(len(matches)))
    print(separator)

    # Lookaround assertions. The text inside the assertion is tested but is
    # neither consumed nor captured, so each match below is just 'no'.
    print("高级匹配规则之正向预搜索与反向预搜索:")
    print("正向预搜索(或者正向预查),后面必须是:")
    str2 = 'none know no'
    regexp2 = r'no(?=w)'  # positive lookahead: 'no' must be followed by 'w'
    matches2 = re.findall(regexp2, str2)
    for m in matches2:
        print(m)
    print(separator)

    print('正向预搜索(或者正向预查),后面必须不是:')
    regexp3 = r'no(?!w)'  # negative lookahead: 'no' must not be followed by 'w'
    matches3 = re.findall(regexp3, str2)
    for m in matches3:
        print(m)
    print(separator)

    print('负向预搜索(或者负向预查),前面必须是:')
    regexp4 = r'(?<=k)no'  # positive lookbehind: 'no' must be preceded by 'k'
    matches4 = re.findall(regexp4, str2)
    for m in matches4:
        print(m)
    print(separator)

    print('负向预搜索(或者负向预查),前面必须不是:')
    regexp5 = r'(?<!k)no'  # negative lookbehind: 'no' must not be preceded by 'k'
    matches5 = re.findall(regexp5, str2)
    for m in matches5:
        print(m)
    print(separator)

    # Greedy versus non-greedy matching.
    print("高级匹配规则之贪婪匹配与非贪婪匹配:")
    # Paraphrasing the explanation in Google's Python class:
    # Suppose the text is '<b>foo</b> and <i>so on</i>' and you try to match
    # each HTML tag with '(<.*>)'. Because .* is greedy, the first match is
    # the whole '<b>foo</b> and <i>so on</i>' rather than just '<b>'.
    # Adding ? after the quantifier (.*? or .+?) makes it non-greedy, so
    # '(<.*?>)' matches '<b>' first, then '</b>', and so on: after finding
    # '<' it stops at the nearest following '>'.
    # The *? syntax originated in Perl; regex flavours that support Perl's
    # syntax are called Perl-compatible regular expressions.
    # A classic alternative that needs no non-greedy quantifier is a negated
    # character class, e.g. '(<[^>]*>)'.
    print("非贪婪匹配做法:")
    html_str = '<b>foo</b> and <i>bar</i>'
    regex_tag = r'<.*?>'
    tags = re.findall(regex_tag, html_str)
    for t in tags:
        print(t)
    print(separator)

    print('中括号做法:')
    # [^>] can never run past a '>', so the trailing ? here is redundant; it
    # is kept to leave the demonstrated pattern unchanged.
    regex_tag2 = r'<[^>]*?>'
    tags2 = re.findall(regex_tag2, html_str)
    for t in tags2:
        print(t)
    print(separator)

    # re.sub(pattern, replacement, string): every substring that matches
    # pattern is replaced by replacement. The replacement may contain \1,
    # \2, ... which refer to group(1), group(2), ... of the match.
    print("re.sub()用法:")
    print(re.sub(r'(\w+), (\w+)', r'\2, \1', "John, Smith"))  # Smith, John
def main():
    """Script entry point: run the regular-expression demo."""
    regexpTest()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# This snippet comes from http://www.codesnippet.cn/detail/180720134679.html
# Source: http://www.codesnippet.cn/detail/180720134679.html