# Learn Python By Practice - Regular Expressions

 
'''
Created on 2012年11月28日
@author: jiangxiaoqiang
本篇关于Python中的正则表达式
'''
#coding = utf-8
import re
 
_SEPARATOR = "-------------------------------------"


def _demo_search():
    """Show re.search(): the whole pattern must match, but not the whole string."""
    print("re.search()：")
    mystr = 'an example word:cat!!'
    # The r prefix makes this a raw string, so the backslashes reach the regex
    # engine untouched; \w\w\w then matches the three word characters "cat".
    match = re.search(r'word:\w\w\w', mystr)
    if match:
        print('Found: ' + match.group())
    else:
        print("Didn't found!")
    print(_SEPARATOR)


def _demo_email_validation():
    """Validate one e-mail address against a fully anchored pattern."""
    print("邮箱验证：")
    # ^ and $ must not be omitted: without them any string that merely
    # contains an address somewhere would be reported as valid.
    email_regexp = r'^[\w-]+(\.[\w-]+)*@([\w-]+\.)+[a-zA-Z]+$'
    email = 'feichexia@yahoo.com.cn'
    if re.search(email_regexp, email):
        print(email, 'is valid!')
    else:
        print(email, 'is not valid!')
    print(_SEPARATOR)


def _demo_findall():
    """Contrast re.findall() without groups (strings) and with groups (tuples)."""
    email_str = 'purple alice@google.com， blah monkey bob@abc.com blah dishwasher'

    # No capturing groups: findall returns a list of the matched substrings,
    # here ['alice@google.com', 'bob@abc.com'].
    print("re.findall()：")
    email_matches = re.findall(r'[\w\.-]+@[\w\.-]+', email_str)
    print(email_matches)
    for email in email_matches:
        print(email)
    print(_SEPARATOR)

    # Same pattern, but the parentheses create two groups, so findall returns
    # one tuple of group values per match:
    # [('alice', 'google.com'), ('bob', 'abc.com')].
    print('re.findall()：元组')
    tuples = re.findall(r'([\w\.-]+)@([\w\.-]+)', email_str)
    print(tuples)
    for user, host in tuples:
        print('用户名:' + user)
        print('Host: ' + host)
    print(_SEPARATOR)


def _demo_flags():
    """Count case-insensitive matches using an option flag."""
    # Commonly used flags:
    #   re.IGNORECASE  ignore upper/lower case.
    #   re.MULTILINE   within a multi-line string, let ^ and $ match at the
    #                  start and end of each line instead of only the whole
    #                  string.
    #   re.DOTALL      let dot (.) match newline as well; by default .* does
    #                  not go past the end of a line. \s already includes
    #                  newlines, so \s* matches whitespace runs across lines.
    print("带额外匹配选项的正则匹配：")
    multistr = '''Foo None what bar
    Not know universe
    True nong'''
    matches = re.findall(r'no', multistr, re.IGNORECASE)
    print("共匹配个数：" + str(len(matches)))
    print(_SEPARATOR)


def _demo_lookaround():
    """Show lookahead and lookbehind assertions, positive and negative."""
    print("高级匹配规则之正向预搜索与反向预搜索：")
    text = 'none know no'
    # Each assertion only constrains the context of "no"; the text inside the
    # parentheses is neither consumed nor captured.
    cases = (
        ("正向预搜索（或者正向预查），后面必须是：", r'no(?=w)'),       # followed by w
        ('正向预搜索（或者正向预查），后面必须不是：', r'no(?!w)'),     # not followed by w
        ('负向预搜索（或者负向预查），前面必须是：', r'(?<=k)no'),      # preceded by k
        ('负向预搜索（或者负向预查），前面必须不是：', r'(?<!k)no'),    # not preceded by k
    )
    for label, pattern in cases:
        print(label)
        for found in re.findall(pattern, text):
            print(found)
        print(_SEPARATOR)


def _demo_greedy():
    """Extract HTML tags with a non-greedy quantifier and with a negated class."""
    print("高级匹配规则之贪婪匹配与非贪婪匹配：")
    # Paraphrased from Google's Python class:
    # Given the HTML '<b>foo</b> and <i>so on</i>', the pattern '(<.*>)' does
    # not match each tag separately. Because .* is greedy it runs as far as
    # possible, so the single match is the whole '<b>foo</b> and <i>so on</i>'.
    # Appending ? (as in .*? or .+?) makes the quantifier non-greedy, so
    # '(<.*?>)' matches '<b>' first, then '</b>', and so on: after finding '<'
    # it stops at the nearest '>'.
    # The *? syntax originates from Perl; regex flavours supporting it are
    # called Perl-compatible. A classic alternative is a negated character
    # class: '(<[^>]*?>)'.
    html_str = '<b>foo</b> and <i>bar</i>'

    print("非贪婪匹配做法：")
    for tag in re.findall(r'<.*?>', html_str):
        print(tag)
    print(_SEPARATOR)

    print('中括号做法：')
    for tag in re.findall(r'<[^>]*?>', html_str):
        print(tag)
    print(_SEPARATOR)


def _demo_sub():
    """Swap two comma-separated words using group references in re.sub()."""
    # re.sub(pattern, replacement, string) replaces every match of pattern.
    # The replacement may contain \1, \2, ... which refer to group(1),
    # group(2), ... of the match.
    print("re.sub()用法：")
    print(re.sub(r'(\w+), (\w+)', r'\2, \1', "John, Smith"))  # Smith, John


def regexpTest():
    """Run every regular-expression demo in order, printing results to stdout.

    Covers re.search, anchored validation, re.findall with and without
    groups, option flags, lookaround assertions, greedy versus non-greedy
    matching, and re.sub with group references. Takes no arguments and
    returns None.
    """
    _demo_search()
    _demo_email_validation()
    _demo_findall()
    _demo_flags()
    _demo_lookaround()
    _demo_greedy()
    _demo_sub()
     
def main():
    """Script entry point: run the regular-expression walkthrough."""
    regexpTest()
 
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
#该片段来自于http://www.codesnippet.cn/detail/180720134679.html
# Source: http://www.codesnippet.cn/detail/180720134679.html
# Related articles:
#
# None yet -- be the first to comment!