当前位置：

首页
/
IT
/
程序
/
Python
/
数据分析 --numpy

数据分析 --numpy

DIKW
DATA-->INFORMATION-->KNOWLEDGE-->WISDOM

数据 -->信息 -->知识 -->智慧

爬虫 -->数据库 -->数据分析 -->机器学习

信息: 通过某种方式组织和处理数据, 分析数据间的关系, 数据就有了意义

知识: 如果说数据是一个事实的集合, 从中可以得出关于事实的结论. 那么知识 (Knowledge) 就是信息的集合, 它使信息变得有用. 知识是对信息的应用, 是一个对信息判断和确认的过程, 这个过程结合了经验, 上下文, 诠释和反省. 知识可以回答 "如何?" 的问题, 可以帮助我们建模和仿真

智慧: 智慧可以简单的归纳为做正确判断和决定的能力, 包括对知识的最佳使用. 智慧可以回答 "为什么" 的问题. 回到前面的例子, 根据故障对客户的业务影响可以识别改进点

数学

微积分

# import math
 # s = 0
 # for i in range(1, 1001):
 #     x = (math.pi / 1000) * i
 #     y = math.sin((math.pi / 1000) * i)
 #     s = (math.pi / 1000) * y + s
 # print(s)
# import numpy as np
 # def sin_integral(l,r,p):
 #     sum_result = 0
 #     delta = (r - l) / p
 #     for i in range(p):
 #         left = i * delta
 #         delta_area = delta * np.sin(left)
 #         sum_result += delta_area
 #     return sum_result
 # print(sin_integral(0.0,np.pi,100000))
numpy
# coding=utf-8
 import numpy as np
 import matplotlib.pyplot as pt
 # x 的 3 次方
 # X = np.linspace(-100, 100, 100)
 # Y = X * X * X
 # tan
 # X = np.linspace(-np.pi/2, np.pi/2, 1000)    # 注意用 / 而不是 //, 否则区间会被向下取整为 [-2, 1]
 # Y = np.tan(X)
 # log
 # X = np.linspace(-10,10,100)
 # Y = np.log(X)
 #
 # pt.plot(X, Y)
 # pt.show()
 # -----------------------------------------------------------------------
 # 鸡兔同笼
 # for x in range(36):
 #     y = 35 - x
 #     if x+2*y == 47:
 #         print(x,y)
 # sinx 面积 0-pi
 # import math
 # s = 0
 # for i in range(1, 1001):
 #     x = (math.pi / 1000) * i
 #     y = math.sin((math.pi / 1000) * i)
 #     s = (math.pi / 1000) * y + s
 # print(s)
 # 承上封装为函数
 # import numpy as np
 # def sin_integral(l,r,p):
 #     sum_result = 0
 #     delta = (r - l) / p
 #     for i in range(p):
 #         left = i * delta
 #         delta_area = delta * np.sin(left)
 #         sum_result += delta_area
 #     return sum_result
 # print(sin_integral(0.0,np.pi,100000))
 # --------------------------------------------------------
 # a = np.arange(18).reshape(3, 6)     # 2-D array (matrix)
 a = np.arange(24).reshape(2,3,4)     # 3-D array
 # print a
 # print a.ndim    # number of dimensions
 # print np.ndim([[1,1],[2,2]])    # number of dimensions of a nested list
 # print a.dtype.name      # element type name, e.g. int32 (platform dependent)
 # print a.size    # total number of elements
 # print a.itemsize    # size of one element in bytes
 # print type(a)         # type of a
 b = np.array([[1.2, 2, 3], [4, 5, 6]])
 # print b.dtype       # float64 array (one float literal promotes the whole array)
 c = np.array([[1, 1], [2, 2]], dtype=complex)
 # print c,c.dtype     # complex128 array
 z = np.zeros((3, 4))
 # print z     # all-zero array; dtype defaults to float64
 o = np.ones((2, 3, 4), dtype=np.int16)
 # print o
 # A 3-D array filled with ones, with the dtype given at creation time;
 # two layers, each of them a 2-D array with three rows and four columns
 e = np.empty((2,3))
 # print e     # uninitialized 2-D array; its contents are arbitrary and differ between machines/runs
 # f = np.arange(1,9,2)
 f = np.arange(0,3,0.5)
 # print f     # arange(1,9,2) gives [1 3 5 7]; 2 and 0.5 are the step values
 # print np.arange(10000)  # when an array is too large to print, NumPy skips the middle and shows only the two ends
 # np.set_printoptions(threshold='nan')      # disables that summarization and prints the whole array; NOTE: recent NumPy rejects 'nan' here, use threshold=sys.maxsize
 # print np.arange(10000).reshape(100,100)
 a = np.array([20,30,40,50])
 b = np.arange(4)
 # print a-b           # element-wise subtraction
 # print b**2              # element-wise square
 # print 10*np.sin(a)      # sin is applied to every element of a, then the result is multiplied by 10

运算

阶乘

math.factorial(100)    # np.math 只是标准库 math 的别名, 已在新版 NumPy 中移除, 请直接 import math

对数

np.log()

开方

1. 准备每一个条件的数据表示 2. 准备程序的逻辑 3. 将你的数据应用到逻辑 4. 优化结构

# np.sqrt(3)
 # A = (2, 7)
 # B = (8, 3)  # 欧几里得距离
 # AB = np.sqrt((A[0] - B[0]) ** 2 + (A[1] - B[1]) ** 2)
 # print AB

三角函数

np.arctan()
np.cos()
np.sin()

np.rad2deg()-- 弧度转角度

np.deg2rad()-- 角度转弧度

......
# x = np.array([3, 0]) + np.array([0, 3])
 # x = np.array([3,3])
 # l = np.linalg.norm(x)   #矢量 x 的范数(长度)
 # h = np.arctan(3.0/3.0)  #计算弧度 π/4
 # j = np.rad2deg(h)       #弧度转角度 45 度
 # np.deg2rad()          #角度转弧度
 # print j

点乘

numpy 数组 (矢量) 默认的 +-*/ 操作都是对应位置的元素相操作

array1.dot(array2)
 # d1 = np.array([2, 7])
 # d2 = np.array([8, 3])
 # print d1.dot(d2)        #点乘(内积) 2*8+7*3 结果: 实数
# 余弦相似度, 向量内积, 对应元素相乘再相加
 '''
 设两个向量分别为 a=(x1,y1),b=(x2,y2),
 其夹角为α, 因为 ab=|a||b|cosα,
 所以 cosα = a·b / (|a||b|) = (x1x2 + y1y2) / (√(x1² + y1²) · √(x2² + y2²))
 '''
 # d12 = d1.dot(d2)                    #d1.d2
 # d1_len = np.linalg.norm(d1)         #|d1|
 # d2_len = np.linalg.norm(d2)         #|d2|
 # cosa = d12 / (d1_len * d2_len)      #余弦值 cosa
 # a = np.rad2deg(np.arccos(cosa))     #角度 a
 # print a

复数

# a = 1 + 2j              #复数  complex
 # b = 2 + 3j             #泰勒级数, 傅里叶级数
 # print a,type(a),a*b,a-b
# np.nan   #not a number 当数据读取缺失或计算异常时会出现, 本质是一个浮点数
 # np.exp(10)  #以 e 为底的指数
 # np.log(10)    #以 e 为底的对数, 即 ln
 # np.e          #e,2.71828182
 # np.inf          #无穷大

函数

空数组

np.empty 只分配内存而不初始化, 元素的值是内存中残留的任意内容 (可能显示为 0, 也可能是极大或极小的数), 使用前必须先赋值

实数在计算机里只能用浮点数近似表示, 不能保证精确, 所以在判断是否等于 0 或两数是否相等时要格外小心: 不要直接用 ==, 而是当差的绝对值小于一个极小的数时就认为相等, 例如 abs(a - b) < 1e-10

np.empty((3, 3))

数组

矢量是有方向和长度的变量, 可以用 numpy 的多维数组来表示, 二维矢量就是平面上的一个点

np.array([[1,2,3],[4,5,6]])

范数

矢量的范数(长度)

np.linalg.norm(np.array([3,3]))

类型转换

array.astype(int)    # np.int 已在 NumPy 1.24 中移除, 直接使用内置的 int

数组信息

array.shape
 array.shape[0]
 array.shape[1]
# Subtracting two vectors and taking the norm of the difference gives the distance between the two points
 d1 = np.array([2, 7])
 # d2 = np.array([8,3])
 # np.linalg.norm(d1-d2)
 # d1.astype(np.int)    # cast the array to int (np.int was removed in NumPy 1.24; pass the builtin int instead)
 # d1.shape    # tuple with the size of every dimension
 # d1.shape[0]     # size of the first dimension (the row count for a 2-D array)
 # d1.shape[1]     # size of the second dimension (the column count); d1 is 1-D, so this raises IndexError

均分

# np.linspace()
# xs = np.linspace(-1000, 1000, 10000)
 # idx = []
 # max_result = []
 # for x in xs:
 #     y = -3 * (x ** 2) + 5 * x - 6
 #     idx.append(x)
 #     max_result.append(y)
 # print max(max_result),idx[max_result.index(max(max_result))]
 # def poly_test(l,r):
 #     r_len = r - l
 #     max_num = l
 #     m_idx = l
 #     for i in range(r_len):
 #         r_num = l + i
 #         result = (r_num ** 2) * -3 + (5 * r_num) - 6
 #         if result> max_num:
 #             max_num = result
 #             m_idx = i
 #     return max_num,m_idx
 # print poly_test(-10000,10000)
# 在 X 轴上生成 20000 个从 -1000 到 10000 的等间距离散点
 # 使用矢量计算直接生成对应上述多项式的所有结果, 这里没有使用循环, 一次计算了 20000 个结果
 # X = np.linspace(-1000, 10000, 20000)
 # Y = (X ** 2) * -3 + 5 * X - 6  # 矢量运算, 计算机会加速此类运算
 # Y.max()  # 获取当前矢量的最大值
 # Y.argmax()  # 获取最大值在数组中的索引 (是下标, 不是对应的 X 值; 对应的 X 值用 X[Y.argmax()] 取得)

数组切片

二维数组

n_array = np.arange(25).reshape(5, 5)
 # print n_array      # the first index selects rows, the second selects columns
 # print n_array[:,:2]     # first two columns
 # print n_array[:3,:]         # first three rows
 # print n_array[1:4,1:4]      # rows 1-3 and columns 1-3 (0-based indices)
 # print n_array[2,2]          # third element of the third row
 # print n_array[2][2]         # same element as above
 # print n_array[::-2]   # every other row, walking backwards from the last row
 # print n_array[::2]    # every other row, starting from the first row

三维数组

n3_array = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [3, 2, 1]], [[6, 5, 4], [9, 8, 7]]])
 # print n3_array          # the first index selects the layer, the second the row, the third the column
 # print n3_array[:,:,2]   # last column of every layer
 # print n3_array[:,:1,:].sum()    # sum over the first row of every layer
 # print n3_array[:1,:,:].mean()       # mean of the first layer
 # print n3_array[:,1,:2].std()        # standard deviation (not variance); a smaller value means less spread

数组元素选取

# d1 <3                           #返回满足条件的布尔类型矩阵
 # np.count_nonzero(d1 < 3)        #统计数组中小于 3 的元素个数
 # d1[d1<3]                         #选出指定范围的元素

学生成绩案例

数据准备

# score_array = np.loadtxt(open('score.csv', 'rb'), delimiter=',', dtype=int)
 # Load the comma-separated score table from score.csv as integers.
 # The functions in this file index it as score_array[student, course],
 # with columns in the same order as `courses`.
 score_array = np.genfromtxt('score.csv', delimiter=',', dtype=int)
 students = []  # not referenced by any code visible in this file
 courses = ['数学', '语文', '化学', '地理', '音乐', '体育']

课程成绩最好

def course_score(scores=None, names=None):
    """Find the course with the highest class-wide total score.

    Every course total is printed (one per line) before the winner is
    returned.

    Args:
        scores: 2-D score table indexed as ``scores[student, course]``.
            Defaults to the module-level ``score_array``.
        names: Course names, one per column, in column order. Defaults to
            the module-level ``courses``. Only the first ``len(names)``
            columns are examined.

    Returns:
        A ``(course_name, total)`` tuple. On a tie the earliest column wins.

    Raises:
        ValueError: If there is no column to compare.
    """
    if scores is None:
        scores = score_array
    if names is None:
        names = courses
    # One vectorized column sum replaces the loop that was hard-coded to 6.
    totals = scores[:, :len(names)].sum(axis=0)
    for total in totals:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(total)
    # argmax always yields a real column. The old "-1" sentinel silently
    # selected the LAST course whenever no total was positive.
    best = int(totals.argmax())
    return names[best], totals[best]

学生成绩最好

def student_score(scores=None):
    """Find the student with the highest total across all courses.

    Every student's total is printed before the summary is returned.

    Args:
        scores: 2-D score table indexed as ``scores[student, course]``.
            Defaults to the module-level ``score_array``.

    Returns:
        A message naming the 0-based row index of the best student and that
        student's total. On a tie the earliest row wins.

    Raises:
        ValueError: If the table has no rows.
    """
    if scores is None:
        scores = score_array
    # Sum along each row. The original looped over range(6), which is the
    # number of COURSES, so any student past the sixth was ignored and a
    # smaller class raised IndexError.
    totals = scores.sum(axis=1)
    for sid, total in enumerate(totals):
        print('{}号学生成绩:{}分'.format(sid, total))
    best = int(totals.argmax())
    return '{}号学生成绩最好, 总分为 {} 分'.format(best, totals[best])

学生偏科

def pian(scores=None):
    """Find the student whose scores are the most uneven across courses.

    Unevenness is measured with the population standard deviation of each
    row (``ndarray.std``). The emitted text labels the value "方差"; that
    wording is kept unchanged so existing output does not change.

    Every student's value is printed before the summary is returned.

    Args:
        scores: 2-D score table indexed as ``scores[student, course]``.
            Defaults to the module-level ``score_array``.

    Returns:
        A message naming the 0-based row index with the largest standard
        deviation and that value. On a tie the earliest row wins.

    Raises:
        ValueError: If the table has no rows.
    """
    if scores is None:
        scores = score_array
    # Cover every row instead of the hard-coded range(6), which matched the
    # number of courses rather than the number of students.
    spreads = scores.std(axis=1)
    for sid, spread in enumerate(spreads):
        print('{}号学生成绩方差为:{}'.format(sid, spread))
    worst = int(spreads.argmax())
    return '{}号学生偏科, 方差为:{}'.format(worst, spreads[worst])

主课成绩最好

def main_course_score(scores=None, main_count=3):
    """Find the main course with the highest class-wide total score.

    The main courses are the first ``main_count`` columns of the table.
    Every main-course total is printed before the winner is returned.

    Args:
        scores: 2-D score table indexed as ``scores[student, course]``.
            Defaults to the module-level ``score_array``.
        main_count: How many leading columns count as main courses.

    Returns:
        A ``(column_index, total)`` tuple. On a tie the earliest column wins.

    Raises:
        ValueError: If there is no column to compare.
    """
    if scores is None:
        scores = score_array
    totals = scores[:, :main_count].sum(axis=0)
    for total in totals:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(total)
    # argmax replaces the "-1" sentinel that was returned as a bogus index
    # whenever no total was positive.
    best = int(totals.argmax())
    return best, totals[best]

该班主课副课对比哪个成绩好

def than():
    """Compare the main courses (columns 0-2) with the side courses (columns 3-5).

    For each group the per-course standard deviations of ``score_array`` are
    averaged; the group with the larger average is reported as the better one.

    NOTE(review): this ranks the groups by spread, not by mean score, and a
    larger spread wins. Confirm that this is the intended criterion.

    Returns:
        A message naming the group considered better; the side courses are
        reported when the two averages are equal.
    """
    main_spread = sum(score_array[:, col].std() for col in range(3)) / 3
    side_spread = sum(score_array[:, col].std() for col in range(3, 6)) / 3
    if main_spread > side_spread:
        return '该班主课成绩更好'
    return '该班副课成绩更好'

这个班有多少学生出现了不及格

def bad(scores=None, pass_mark=60):
    """List the students who failed at least one course.

    Args:
        scores: 2-D score table indexed as ``scores[student, course]``.
            Defaults to the module-level ``score_array``.
        pass_mark: A score strictly below this value counts as a fail.

    Returns:
        A message containing the list of 0-based row indices of students
        whose lowest score is below ``pass_mark``.
    """
    if scores is None:
        scores = score_array
    # Walk every row. The original used range(6), the number of courses, so
    # students past the sixth were never checked and a smaller class raised
    # IndexError.
    badstudent = [sid for sid, row in enumerate(scores) if row.min() < pass_mark]
    return '不及格学生:{}'.format(badstudent)

封装成类

# Column index -> course name for the score table.
name_dic = {0: '数学', 1: '语文', 2: '化学', 3: '地理', 4: '音乐', 5: '体育'}


class CoursaDesc(object):
    """Plain record holding the summary statistics of one course."""

    def __init__(self):
        self.name = ''
        self.std = 0
        self.max = 0
        self.min = 0
        self.mean = 0
        # Never written by ComputerDesc; kept so existing readers of the
        # attribute keep working.
        self.num = 0
        # ComputerDesc writes and reads `sum`, so it is initialised here too.
        self.sum = 0


class ComputerDesc(object):
    """Compute per-course statistics for a score table and pick the best course."""

    # The most stable course may trail the top total by at most this
    # fraction and still win.
    SUM_TOLERANCE = 0.05
    # The top-total course must exceed the most stable course's standard
    # deviation by more than this fraction for the stable course to win.
    STD_MARGIN = 0.2

    def __init__(self, n_array):
        """Store the table; it is indexed as ``n_array[student, course]``."""
        self.score_array = n_array
        self.result = []

    def counter_all_coursa(self):
        """Fill ``self.result`` with one ``CoursaDesc`` per column.

        The list is rebuilt on every call, so calling this twice no longer
        leaves duplicate entries. Columns without an entry in ``name_dic``
        are named after their index.
        """
        descs = []
        for i in range(self.score_array.shape[1]):
            column = self.score_array[:, i]
            c_desc = CoursaDesc()
            c_desc.name = name_dic.get(i, str(i))
            c_desc.std = column.std()
            c_desc.mean = column.mean()
            c_desc.max = column.max()
            c_desc.min = column.min()
            c_desc.sum = column.sum()
            descs.append(c_desc)
        self.result = descs

    def best_coursa(self):
        """Return the name of the best course.

        If one course has both the highest total and the lowest standard
        deviation, it wins. Otherwise the lowest-deviation course wins only
        when its total is within ``SUM_TOLERANCE`` of the top total and the
        top-total course's deviation exceeds it by more than ``STD_MARGIN``.
        In every other case the top-total course wins.

        The statistics are computed first if ``counter_all_coursa`` has not
        been called yet.

        Raises:
            ValueError: If the table has no columns.
        """
        if not self.result:
            self.counter_all_coursa()
        std_array = np.array([coursa.std for coursa in self.result])
        sum_array = np.array([coursa.sum for coursa in self.result])
        max_sum_index = int(sum_array.argmax())
        min_std_index = int(std_array.argmin())
        if max_sum_index == min_std_index:
            return self.result[max_sum_index].name

        max_sum_coursa = sum_array[max_sum_index]
        min_std_coursa = std_array[min_std_index]
        # float() forces true division. Under Python 2 an int / int ratio
        # floored to 0, which made the 5% test pass unconditionally.
        sum_delta = float(max_sum_coursa - sum_array[min_std_index])
        std_delta = float(std_array[max_sum_index] - min_std_coursa)
        sum_percent = sum_delta / max_sum_coursa if max_sum_coursa else 0.0
        if min_std_coursa:
            std_percent = std_delta / min_std_coursa
        else:
            # A zero baseline makes any positive gap unboundedly large.
            std_percent = float('inf') if std_delta > 0 else 0.0

        if sum_percent < self.SUM_TOLERANCE and std_percent > self.STD_MARGIN:
            return self.result[min_std_index].name
        # The original fell off the end here and returned None.
        return self.result[max_sum_index].name


if __name__ == '__main__':
    c = ComputerDesc(score_array)
    c.counter_all_coursa()
    print(c.best_coursa())

来源: https://www.cnblogs.com/siplips/p/9832180.html

与本文相关文章

暂无,快来抢沙发吧！