机器学习 -- 数据预处理

基础

机器学习主要有两种: 监督学习和非监督学习. 监督学习使用带标签的数据训练模型, 也就是明确告诉计算机每个样本的目标 (正确答案) 是什么; 非监督学习使用没有标签的数据, 不设定目标, 让计算机 "自学成才", 学习完再告诉我们它从数据中发现了什么规律.

# encoding=utf-8
 from sklearn import linear_model
 import matplotlib.pyplot as plt
 import numpy as np
 # Historical house area / price samples (the original note mentions a CSV file; here the rows are inlined)
 data = np.array([[150, 6450], [200, 7450], [250, 8450], [300, 9450], [350, 11450], [400, 15450], [600, 18450]])
 # NOTE(review): the disabled statements below use Python 2 print syntax.
 # print data[:, 0].reshape(-1, 1)
 # plt.scatter(data[:, 0], data[:, 1], color='blue')
 # plt.show()
 # Linear model
 # regr = linear_model.LinearRegression()
 # Fit: column 0 is the single feature, column 1 is the target
 # regr.fit(data[:, 0].reshape(-1, 1), data[:, 1])
 # Slope and intercept of the fitted line
 # a, b = regr.coef_, regr.intercept_
 # print a, b
 # plt.plot(data[:,0],regr.predict(data[:,0].reshape(-1,1)),color='red',linewidth=4)
 # plt.scatter(data[:, 0], regr.predict(data[:, 0].reshape(-1, 1)), color='red')
 # Predict the price for areas of 175 and 800 (the original comment said "days", but the feature is house area)
 # print regr.predict(175)
 # print regr.predict(800)
 # plt.show()

数据预处理

导入类库

from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 import jieba
 from sklearn.feature_selection import VarianceThreshold
 from sklearn.preprocessing import StandardScaler, MinMaxScaler

数据处理

字典数据抽取

代码

def dictvec():
    """Demonstrate dictionary feature extraction with ``DictVectorizer``.

    A dense vectorizer (``sparse=False`` yields a plain matrix instead of a
    sparse one) is fitted on five sample city records. Three things are
    printed, in this order:

    1. the feature names, i.e. the column headers of the encoded matrix;
    2. the result of ``inverse_transform``, which maps every row back to a
       dict holding only the features present (non-zero) in that row;
    3. the encoded matrix itself.

    :return: None
    """
    # Named ``vectorizer`` rather than ``dict`` so the builtin is not shadowed.
    vectorizer = DictVectorizer(sparse=False)
    records = [
        {'city': '北京', 'pos': '北方', 'temperature': 100},
        {'city': '上海', 'pos': '南方', 'temperature': 60},
        {'city': '深圳', 'pos': '南方', 'temperature': 30},
        {'city': '重庆', 'pos': '南方', 'temperature': 70},
        {'city': '北京', 'pos': '北方', 'temperature': 100},
    ]
    encoded = vectorizer.fit_transform(records)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # .tolist() keeps the printed form a plain list of str.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(vectorizer.inverse_transform(encoded))
    print(encoded)
    return None
结果
'''['city=上海','city=北京','city=深圳','city=重庆','pos=北方','pos=南方','temperature']
[{'city=北京': 1.0, 'pos=北方': 1.0, 'temperature': 100.0}, {'city=上海': 1.0, 'pos=南方': 1.0, 'temperature': 60.0}, {'city=深圳': 1.0, 'pos=南方': 1.0, 'temperature': 30.0}, {'city=重庆': 1.0, 'pos=南方': 1.0, 'temperature': 70.0}, {'city=北京': 1.0, 'pos=北方': 1.0, 'temperature': 100.0}]
[[  0.   1.   0.   0.   1.   0. 100.]
 [  1.   0.   0.   0.   0.   1.  60.]
 [  0.   0.   1.   0.   0.   1.  30.]
 [  0.   0.   0.   1.   0.   1.  70.]
 [  0.   1.   0.   0.   1.   0. 100.]]
'''
英文特征值化
代码
def countvec():
    """Demonstrate bag-of-words counting of English text with ``CountVectorizer``.

    Two short sentences are vectorized; the learned feature names and the
    dense count matrix are printed.

    Notes on the printed result:

    * Feature names come out in alphabetical order (not by how common a word
      is in English).
    * The word "a" is missing because the default token pattern only keeps
      tokens of two or more characters; no stop-word list is configured here.

    :return: None
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(['this is a test test', 'we have a test'])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # .tolist() keeps the printed form a plain list of str.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    # fit_transform returns a sparse matrix; densify it for display.
    print(counts.toarray())
    return None
结果
'''['have','is','test','this','we']
[[0 1 2 1 0]
 [1 0 1 0 1]]
'''
中文特征值化
代码
def cutword():
    """Segment three fixed Chinese sentences with jieba.

    Each sentence is cut into words and the words are joined back together
    with single spaces, producing whitespace-delimited text.

    :return: tuple of three space-separated strings, one per sentence, in the
        order the sentences are listed below
    """
    sentences = ('天空灰得像哭过', '离开你以后', '并没有很自由')
    # jieba.cut yields the words lazily; str.join consumes them directly.
    return tuple(' '.join(jieba.cut(sentence)) for sentence in sentences)
def hanzivec():
    """Demonstrate bag-of-words counting of Chinese text with ``CountVectorizer``.

    The three pre-segmented sentences returned by ``cutword()`` are printed,
    vectorized, and then the learned feature names and the dense count matrix
    are printed.

    NOTE(review): single-character words do not show up as features; this
    looks like the default token pattern (two or more characters) at work --
    confirm against the scikit-learn documentation.

    :return: None
    """
    c1, c2, c3 = cutword()
    vectorizer = CountVectorizer()
    print(c1, c2, c3)
    counts = vectorizer.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # .tolist() keeps the printed form a plain list of str.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    # fit_transform returns a sparse matrix; densify it for display.
    print(counts.toarray())
    return None
结果
'''
 天空 灰得 像 哭 过 离开 你 以后 并 没有 很 自由
['以后', '天空', '没有', '灰得', '离开', '自由']
[[0 1 0 1 0 0]
 [1 0 0 0 1 0]
 [0 0 1 0 0 1]]
'''
词频
代码
def tfidfvec():
    """Demonstrate TF-IDF weighting of Chinese text with ``TfidfVectorizer``.

    The three pre-segmented sentences returned by ``cutword()`` are printed,
    vectorized, and then the learned feature names and the dense TF-IDF
    matrix are printed.

    Background:

    * TF (term frequency): occurrences of a word in a document divided by
      the total number of words in that document.
    * IDF (inverse document frequency): log(total number of documents /
      number of documents containing the word).
    * The larger the TF and IDF values, the more distinctive the word.

    NOTE(review): the printed weights do not follow the textbook formula
    literally; TfidfVectorizer's defaults appear to apply a smoothed IDF and
    normalise each row -- confirm against the scikit-learn documentation.

    :return: None
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform([c1, c2, c3])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # .tolist() keeps the printed form a plain list of str.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    # fit_transform returns a sparse matrix; densify it for display.
    print(weights.toarray())
    return None
结果
'''
 天空 灰得 像 哭 过 离开 你 以后 并 没有 很 自由
['以后', '天空', '没有', '灰得', '离开', '自由']
[[0.         0.70710678 0.         0.70710678 0.         0.        ]
 [0.70710678 0.         0.         0.         0.70710678 0.        ]
 [0.         0.         0.70710678 0.         0.         0.70710678]]
'''
标准化缩放
代码
def stand():
    """Demonstrate standardization with ``StandardScaler``.

    Standardization rescales every feature column to zero mean and unit
    standard deviation. It suits columns whose magnitudes differ widely while
    their relative variation is similar, e.g. one axis at 1000, 2000, 3000
    and another at 1, 2, 3.

    :return: None
    """
    samples = [[1., 2., 3.], [100., 200., 300.], [1000., 2000., 3000.]]
    # Alternative sample set kept from the original for experimentation:
    # [[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]]
    scaled = StandardScaler().fit_transform(samples)
    print(scaled)
    return None
结果
'''
[[-0.81438366 -0.81438366 -0.81438366]
 [-0.59409956 -0.59409956 -0.59409956]
 [ 1.40848322  1.40848322  1.40848322]]
'''
归一化
代码
def mm():
    """Demonstrate min-max normalization with ``MinMaxScaler``.

    Similar in spirit to standardization, except that the target range of the
    rescaled feature values can be chosen; here it is set to (2, 3).

    :return: None
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(2, 3))
    print(scaler.fit_transform(samples))
    return None
结果
'''
[[3.         2.         2.         2.        ]
 [2.         3.         3.         2.83333333]
 [2.5        2.5        2.6        3.        ]]
'''
特征选择
代码
def var():
    """Demonstrate feature selection by dropping low-variance columns.

    ``threshold`` is the variance cut-off: feature columns whose variance
    falls below it are removed. Low-variance columns carry little
    distinguishing information.

    :return: None
    """
    samples = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
    selector = VarianceThreshold(threshold=1.0)
    kept = selector.fit_transform(samples)
    print(kept)
    return None
结果
'''
[[0]
 [4]
 [1]]
'''

来源: https://www.cnblogs.com/siplips/p/9741626.html

与本文相关文章

暂无,快来抢沙发吧！