贝叶斯介绍 (Introduction to Naive Bayes)
# Compare the three naive-Bayes variants on the iris dataset.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# The three naive-Bayes models: Gaussian, multinomial, Bernoulli.
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

# Load the data and hold out a test split.
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target)

# GaussianNB suits continuous features, so it performs well here;
# MultinomialNB and BernoulliNB assume count/binary features and do poorly.
for model in (GaussianNB(), MultinomialNB(), BernoulliNB()):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # BUG FIX: classification_report/confusion_matrix expect (y_true, y_pred);
    # the original passed predictions first, transposing the report/matrix.
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
词袋模型介绍 (Introduction to the bag-of-words model)
# Bag-of-words demo: count word occurrences across a tiny corpus.
from sklearn.feature_extraction.text import CountVectorizer  # text -> count vectors

texts = ["dog cat fish", "dog cat cat", "fish bird", "bird"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)

# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the supported replacement.
print(cv.get_feature_names_out())
print(cv_fit.toarray())           # one row per document, one column per word
print(cv_fit.toarray().sum(axis=0))  # total count of each word over the corpus
''' Expected output (vocabulary is sorted alphabetically):
['bird' 'cat' 'dog' 'fish']
[[0 1 1 1]
 [0 2 1 0]
 [1 0 0 1]
 [1 0 0 0]]
[2 3 2 2]
'''
# TF-IDF demo: weight words by how informative they are across documents.
from sklearn.feature_extraction.text import TfidfVectorizer

# A small corpus of documents.
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]

# Build the transformer: tokenize the corpus and learn the vocabulary.
vectorizer = TfidfVectorizer()
vectorizer.fit(text)

# Inspect what was learned: word -> column index, and per-word IDF weights.
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

# Encode the first document as a TF-IDF vector and show it.
vector = vectorizer.transform([text[0]])
print(vector.shape)
print(vector.toarray())
来源: http://www.bubuko.com/infodetail-3282050.html