sklearn特征选择和分类模型

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the stand-alone joblib package and only fall back to the
# vendored copy on old installations.
try:
    from joblib import Memory
except ImportError:
    from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file

# On-disk cache so the svmlight file only has to be parsed once.
mem = Memory("./mycache")


@mem.cache
def get_data():
    """Load the labeled samples from ``labeled_fea.txt``.

    The file is read with ``load_svmlight_file``; the call is wrapped by
    ``mem.cache`` so repeated runs can reuse the cached result.

    Returns:
        A ``(features, labels)`` pair: the first two items returned by
        ``load_svmlight_file``.
    """
    data = load_svmlight_file("labeled_fea.txt")
    return data[0], data[1]


X, y = get_data()
# Number of leading samples used for training; everything after is the test set.
TRAIN_SIZE = 800000

# Positional split: no shuffling is done here, the row order of X / y is kept.
train_X = X[:TRAIN_SIZE]
train_y = y[:TRAIN_SIZE]
test_X = X[TRAIN_SIZE:]
test_y = y[TRAIN_SIZE:]
print(train_X.shape)
print(test_X.shape)
from time import time

from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Stand-alone feature selection step: SelectKBest with the chi2 score
# function and k = 10000. The selector is fitted on the training split only
# and then applied unchanged to the test split.
ch2 = SelectKBest(chi2, k=10000)
train_X = ch2.fit_transform(train_X, train_y)
test_X = ch2.transform(test_X)
# Given a classification model: train it, then test it.
def benchmark(clf):
    """Fit ``clf`` on the training split and score it on the test split.

    Uses the module-level ``train_X`` / ``train_y`` for fitting and
    ``test_X`` / ``test_y`` for evaluation, printing the classifier, both
    wall-clock timings and the accuracy as it goes.

    Args:
        clf: An object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A tuple ``(clf_descr, score, train_time, test_time)`` where
        ``clf_descr`` is the text of ``str(clf)`` before the first ``(``,
        ``score`` is the value of ``metrics.accuracy_score`` and the two
        times are in seconds.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    clf.fit(train_X, train_y)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(test_X)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(test_y, pred)
    print("accuracy:   %0.3f" % score)

    # Keep only the part of the repr before the parameter list.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
# Classifier to benchmark. To try another model, comment out this line and
# uncomment one of the alternatives listed below.
clf = RandomForestClassifier(n_estimators = 100)
# Alternative classifiers:
#clf = RidgeClassifier(tol = 1e-2, solver = "lsqr")
#clf = Perceptron(n_iter = 50)
#clf = LinearSVC()
#clf = GradientBoostingClassifier()
#clf = SGDClassifier(alpha = .0001, n_iter = 50, penalty = "l1")
#clf = SGDClassifier(alpha = .0001, n_iter = 50, penalty = "elasticnet")
#clf = NearestCentroid()
#clf = MultinomialNB(alpha = .01)
#clf = BernoulliNB(alpha = .01)
# NOTE(review): `n_iter` in the Perceptron / SGDClassifier alternatives appears
# to have been replaced by `max_iter` in newer scikit-learn releases - confirm
# against the installed version before uncommenting.
#
# Pipeline variant: feature selection and the classification model combined
# into a single estimator.
#clf = Pipeline([('feature_selection', LinearSVC(penalty = "l1", dual = False, tol = 1e-3)), ('classification', LinearSVC())])
# NOTE(review): newer scikit-learn releases likely require wrapping the L1
# LinearSVC in SelectFromModel for it to act as a selection step - confirm.

# Train and evaluate the selected classifier.
benchmark(clf)

值得注意的是，上面的程序中，训练和预测阶段都在同一份程序里运行；而在实际应用中，训练和预测是分开的。因此，要使用 Python 的对象序列化特性（例如 pickle 模块）：每次训练完之后，序列化模型对象，保存模型的状态；预测时反序列化模型对象，还原模型的状态。

参考资料：

http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_svmlight_file.html

http://scikit-learn.org/stable/modules/generated/sklearn.datasets.dump_svmlight_file.html

http://scikit-learn.org/stable/modules/feature_selection.html#feature-selection

http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#example-text-document-classification-20newsgroups-py

来源: http://www.bubuko.com/infodetail-2223597.html

与本文相关文章

暂无,快来抢沙发吧！