NLTK 内置词性标注器
用 nltk.pos_tag()函数进行词性标注
import nltk

# Resources required below.
# Fix: the original downloaded only the tagger model; nltk.word_tokenize
# also needs the 'punkt' tokenizer data, otherwise it raises LookupError
# on a fresh installation.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

simpleSentence = 'Bangalore is the capital of Karnataka.'

# Split the sentence into word tokens.
wordsInSentence = nltk.word_tokenize(simpleSentence)
print(wordsInSentence)

# Tag each token with its part of speech (Penn Treebank tag set).
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)
print(partsOfSpeechTags)
输出:
- ['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka', '.']
- [('Bangalore', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Karnataka', 'NNP'), ('.', '.')]
自己的词性标注器
- import nltk
# Default tagging: every token, known or not, gets the tag 'NN'.
def learnDefaultTagger(simpleSentence):
    """Tokenize *simpleSentence* and tag every token 'NN' via DefaultTagger."""
    tokens = nltk.word_tokenize(simpleSentence)
    fallback_tagger = nltk.DefaultTagger('NN')
    print(fallback_tagger.tag(tokens))
# Regular-expression based tagging.
def learnRETagger(simpleSentence):
    """Tag tokens by matching suffix/word regex patterns.

    Patterns are tried in order; the first match wins, and the final
    catch-all maps anything unmatched to None. The raw-string prefix
    on each pattern is required so backslashes reach the regex engine.
    """
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),
        (r'.*ly$', 'ADVERB'),
        (r'.*ion$', 'NOUN'),
        (r'(.*ate|.*en|is)$', 'VERB'),
        (r'^an$', 'INDEFINITE-ARTICLE'),
        (r'^(with|on|at)$', 'PREPOSITION'),
        (r'^[0-9]*$', 'NUMBER'),
        (r'.*$', None),
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    # Fix: local variable was misspelled 'wordsInSentencs' in the original.
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
# Dictionary (lookup) based tagging.
def learnLookupTagger(simpleSentence):
    """Tag tokens with a hand-written word->tag mapping (UnigramTagger model).

    Any token absent from the mapping is tagged None.
    """
    mapping = {
        '.': '.', 'place': 'NN', 'on': 'IN', 'earth': 'NN', 'Mysore': 'NNP',
        'is': 'VBZ', 'an': 'DT', 'amazing': 'JJ',
    }
    tagger = nltk.UnigramTagger(model=mapping)
    # Fix: local variable was misspelled 'wordsInSentencs' in the original.
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
if __name__ == "__main__":
    # Run the same sentence through all three taggers for comparison.
    testSentence = 'Mysore is an amazing place on earth. I have visited Mysore 10 times.'
    for tagger_demo in (learnDefaultTagger, learnRETagger, learnLookupTagger):
        tagger_demo(testSentence)
输出:
- [('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]
- [('Mysore', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', 'NUMBER'), ('times', None), ('.', None)]
- [('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]
训练自己的词性标注器
- import nltk
- import pickle
# Training corpus.
def sampleData():
    """Return the small fixed list of training sentences."""
    corpus = (
        'Bangalore is the capital of Karnataka.',
        'Steve Jobs was the CEO of Apple.',
        'iPhone was Invented by Apple.',
        'Books can be purchased in Market.',
    )
    return list(corpus)
# Tokenize every training sentence, POS-tag it, and collect word -> tag pairs.
def buildDictionary():
    """Build a {word: pos_tag} dict from the sample sentences.

    If a word occurs more than once, the tag seen last wins.
    """
    dictionary = {}
    for sent in sampleData():
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sent)):
            dictionary[word] = pos
    return dictionary
def saveMyTagger(tagger, fileName):
    """Pickle *tagger* to *fileName* in binary mode.

    Fix: use a context manager so the file handle is closed even when
    pickle.dump raises, instead of a bare open()/close() pair.
    """
    with open(fileName, 'wb') as fileHandle:
        pickle.dump(tagger, fileHandle)
# Train a UnigramTagger from the learned dictionary and persist it.
def saveMyTraining(fileName):
    """Build the lookup tagger from buildDictionary() and save it to *fileName*."""
    trained = nltk.UnigramTagger(model=buildDictionary())
    saveMyTagger(trained, fileName)
# Load a previously saved tagger model.
def loadMyTagger(fileName):
    """Unpickle and return the tagger stored in *fileName*.

    Fix: the original opened the file without ever closing it (resource
    leak); a context manager guarantees the handle is released.
    """
    with open(fileName, 'rb') as fileHandle:
        return pickle.load(fileHandle)
# Driver: train on the sample corpus, persist the model, reload it,
# and tag a previously unseen sentence with the restored tagger.
sentence = 'IPhone is purchased by Steve Jobs in Bangalore Market.'
fileName = 'myTagger.pickle'
saveMyTraining(fileName)
myTagger = loadMyTagger(fileName)
print(myTagger.tag(nltk.word_tokenize(sentence)))
输出:
[('IPhone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP'), ('.', '.')]
编写自己的文法
上下文无关文法:
1. 开始符号 / 标记
2. 终结符号集合
3. 非终结符号集合
4. 定义开始符号和规则(产生式)
5. 语言是英文时, a-z 是符号 / 标记 / 字母
6. 语言是数字时, 0-9 是符号 / 标记 / 字母
产生式是用巴克斯 - 诺尔 (BNF) 范式写的
import nltk
import string
from nltk.parse.generate import generate
import sys  # kept from the original listing (not used below)

# Context-free grammar with start symbol ROOT.
productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER",
]

# Add one production per digit: NUMBER -> '0' | '1' | '2' | '3'.
for digit in string.digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))

# Add a single alternation production: LETTER -> 'a' | 'b' | 'c' | 'd'.
letters = "'|'".join(string.ascii_lowercase[:4])
productions.append("LETTER -> '{w}'".format(w=letters))

# One rule per line.
grammarString = '\n'.join(productions)

# Build the CFG object and display it.
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

# Generate at most 5 strings with derivations at most 4 levels deep.
for sentence in generate(grammar, n=5, depth=4):
    palindrome = ''.join(sentence).replace(' ', '')
    print('Generated Word: {} , Size : {}'.format(palindrome, len(palindrome)))
输出
- Grammar with 12 productions (start state = ROOT)
- ROOT -> WORD
- WORD -> ' '
- WORD -> NUMBER LETTER
- WORD -> LETTER NUMBER
- NUMBER -> '0'
- NUMBER -> '1'
- NUMBER -> '2'
- NUMBER -> '3'
- LETTER -> 'a'
- LETTER -> 'b'
- LETTER -> 'c'
- LETTER -> 'd'
- Generated Word: , Size : 0
- Generated Word: 0a , Size : 2
- Generated Word: 0b , Size : 2
- Generated Word: 0c , Size : 2
- Generated Word: 0d , Size : 2
基于概率的上下文无关文法
所有非终结符号 (左侧) 的概率之和等于 1
描述 | 内容 |
---|---|
开始符号 | ROOT |
非终结符号 | WORD,P1,P2,P3,P4 |
终结符号 | 'A','B','C','D','E','F','G','H' |
import nltk
from nltk.parse.generate import generate

# Probabilistic CFG: for each left-hand side, the probabilities sum to 1.
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 ->'A'[1.0]",
    "P2 ->'B'[0.5]",
    "P2 ->'C'[0.5]",
    "P3 ->'D'[0.3]",
    "P3 ->'E'[0.3]",
    "P3 ->'F'[0.4]",
    "P4 ->'G'[0.9]",
    "P4 ->'H'[0.1]",
]

# Join the rules (one per line) and parse them into a PCFG object.
grammarString = '\n'.join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

# Generate up to 5 strings with derivations at most 4 levels deep.
for sentence in generate(grammar, n=5, depth=4):
    palindrome = ''.join(sentence).replace(' ', '')
    print('String : {} , Size : {}'.format(palindrome, len(palindrome)))
输出:
- Grammar with 13 productions (start state = ROOT)
- ROOT -> WORD [1.0]
- WORD -> P1 [0.25]
- WORD -> P1 P2 [0.25]
- WORD -> P1 P2 P3 [0.25]
- WORD -> P1 P2 P3 P4 [0.25]
- P1 -> 'A' [1.0]
- P2 -> 'B' [0.5]
- P2 -> 'C' [0.5]
- P3 -> 'D' [0.3]
- P3 -> 'E' [0.3]
- P3 -> 'F' [0.4]
- P4 -> 'G' [0.9]
- P4 -> 'H' [0.1]
- String : A , Size : 1
- String : AB , Size : 2
- String : AC , Size : 2
- String : ABD , Size : 3
- String : ABE , Size : 3
编写递归的上下文无关文法
以递归方法生成回文为例, 回文: 比如 01 语言系统的 010010 等
# Generate even-length palindromes over the digits 0-9.
import nltk
import string
from nltk.parse.generate import generate

productions = [
    'ROOT -> WORD',
    "WORD ->' '",
]

# Recursive rules WORD -> 'd' WORD 'd' for every digit d, so each
# derivation wraps a smaller palindrome with the same digit on both sides.
for ch in string.digits:
    productions.append("WORD ->'{w}'WORD'{w}'".format(w=ch))

grammarString = '\n'.join(productions)
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

# Up to 5 strings; depth=5 caps the recursion.
for sentence in generate(grammar, n=5, depth=5):
    palindrome = ''.join(sentence).replace(' ', '')
    print('Palindrome : {} , Size : {}'.format(palindrome, len(palindrome)))
输出:
- Grammar with 12 productions (start state = ROOT)
- ROOT -> WORD
- WORD -> ' '
- WORD -> '0' WORD '0'
- WORD -> '1' WORD '1'
- WORD -> '2' WORD '2'
- WORD -> '3' WORD '3'
- WORD -> '4' WORD '4'
- WORD -> '5' WORD '5'
- WORD -> '6' WORD '6'
- WORD -> '7' WORD '7'
- WORD -> '8' WORD '8'
- WORD -> '9' WORD '9'
- Palindrome : , Size : 0
- Palindrome : 00 , Size : 2
- Palindrome : 0000 , Size : 4
- Palindrome : 0110 , Size : 4
- Palindrome : 0220 , Size : 4
来源: http://www.bubuko.com/infodetail-3111499.html