- # encoding:utf-8
- import time
- import re
- from functools import reduce
# --------------------------class RePattern():---------------------------------
# Convert a sample string into a regular expression, then use that regular
# expression to match other strings.
class RePattern():
    """Derive a fixed-width regular expression from a sample string.

    Every character of the sample is classified as ``lower``, ``upper``,
    ``digit``, ``space`` or ``other``.  Consecutive characters of the same
    class are merged into one block, and each block becomes a counted
    character class, e.g. ``'xXX34XXX'`` -> ``[a-z]{1}[A-Z]{2}[0-9]{2}[A-Z]{3}``.

    Attributes:
        pattern: source text of the generated regular expression.
        pre: the compiled regular expression (result of ``re.compile``).
        patternLength: number of characters in the sample string.
        block: length of each run, e.g. ``'xXX34XXX'`` -> ``[1, 2, 2, 3]``.
        blockType: class of each run, e.g.
            ``'xXX34XXX'`` -> ``['lower', 'upper', 'digit', 'upper']``.
        typedict: maps a class name to the opening part of its regex
            fragment (everything up to and including ``{``).
        bCount: legacy scratch counter; always 1, kept only so that
            existing readers of the attribute keep working.

    Note:
        Classification uses the Unicode-aware ``str`` predicates while the
        generated character classes are ASCII ranges, so non-ASCII letters
        or digits in the sample produce blocks that the same characters
        will not match.
    """

    def __init__(self, patternStr):
        """Build and compile the regular expression for ``patternStr``."""
        self.pattern = ''  # regex source text
        self.pre = None  # compiled regex
        self.patternLength = len(patternStr)
        self.bCount = 1  # unused legacy counter, see class docstring
        self.block = []  # run lengths
        self.blockType = []  # run classes
        # Raw strings keep the regex escapes intact.  'other' must exclude
        # the four named classes; the union [\D\W\S] matched any character.
        self.typedict = {
            'lower': r'[a-z]{',
            'upper': r'[A-Z]{',
            'digit': r'[0-9]{',
            'space': r'[\s]{',
            'other': r'[^a-zA-Z0-9\s]{',
        }
        self.__setPattern(patternStr)

    def __setPattern(self, patternStr):
        """Fill ``block``/``blockType`` and compile the resulting regex."""
        # Run-length encode the per-character classes.
        for char_type in map(self.__toType, patternStr):
            if self.blockType and self.blockType[-1] == char_type:
                self.block[-1] += 1
            else:
                self.blockType.append(char_type)
                self.block.append(1)
        # Each block contributes '<class>{<count>}'.
        self.pattern = ''.join(
            '{0}{1}}}'.format(self.typedict[btype], blen)
            for btype, blen in zip(self.blockType, self.block)
        )
        self.pre = re.compile(self.pattern)

    def __toType(self, s):
        """Return the class name of the single character ``s``."""
        if s.islower():
            return 'lower'
        if s.isupper():
            return 'upper'
        if s.isdigit():
            return 'digit'
        if s.isspace():
            return 'space'
        return 'other'

    def isPattern(self, compareStr):
        """Return the text matched at the start of ``compareStr``.

        The regex is anchored at the start only (``re.match``), so a longer
        string matches on its prefix.  Returns ``None`` when there is no
        match.
        """
        found = self.pre.match(compareStr)
        if found:
            return found.group()
        return None

    def __str__(self):
        """Return a multi-line summary of the blocks and the pattern."""
        return ' block:{0}\n blockType:{1}\n pattern :{2}'.format(
            self.block, self.blockType, self.pattern)
- # --------------------------class RePattern(): end----------
# Find strings of a particular shape in a file.
# Each matching string is reported as the slice match[start:end].
def pickFromFile(file, strr, start=None, end=None):
    """Yield every substring of ``file`` shaped like the sample ``strr``.

    A ``RePattern`` is built from ``strr`` and printed once, then each line
    of the file is scanned with a sliding window of ``len(strr)`` characters.

    Args:
        file: path of the text file to scan (opened with the default
            encoding).
        strr: sample string whose character-class layout is searched for.
        start: optional start index applied to each matched string.
        end: optional end index applied to each matched string.

    Yields:
        ``(match[start:end], count)`` tuples, where ``count`` is the
        zero-based running index of the match across the whole file.
    """
    count = 0
    # The context manager closes the file even if the consumer abandons the
    # generator before it is exhausted.
    with open(file) as f:
        rp = RePattern(strr)
        print(rp)
        patternLength = rp.patternLength
        for line in f:
            # Drop the line terminator, then test every window including the
            # last one; otherwise a match at the very end of an unterminated
            # final line would be skipped.
            text = line.rstrip('\n')
            for n in range(len(text) - patternLength + 1):
                comp = rp.isPattern(text[n:n + patternLength])
                if comp:
                    yield (comp[start:end], count)
                    count += 1
# -------------------- run --------------------
# Demo: time a scan of ../data.txt for strings shaped like 'xxx %XX00'
# (three lowercase letters, a space, a symbol, two uppercase letters and
# two digits).
time1 = time.time()
strs = pickFromFile("../data.txt", 'xxx %XX00')
for s in strs:
    print(s)
# Example matches: 'abc &BC34', 'sdf @VN03'
time2 = time.time()
print(time2 - time1)
# This snippet comes from http://www.codesnippet.cn/detail/3107201410066.html
# Source: http://www.codesnippet.cn/detail/3107201410066.html