TensorFlow 从 1 到 2(六) 结构化数据预处理和心脏病预测

#!/usr/bin/env python3
from __future__ import absolute_import, division, print_function
# 引入所需的库和模块
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
# Load the sample data set from a local CSV file.
# To read it straight from the web instead, use this line:
# URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
URL = 'heart.csv'
dataframe = pd.read_csv(URL)
# Uncomment to preview the first few rows:
# dataframe.head()

# First hold out 20% of all rows as the test set, then split the remaining
# 80% once more 80/20: 64% of the data is used for training and 16% for
# validation.
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

# Report how many records each of the three subsets holds.
for subset, caption in ((train, 'train examples'),
                        (val, 'validation examples'),
                        (test, 'test examples')):
    print(len(subset), caption)
# Helper that converts a pandas DataFrame into a TensorFlow Dataset.
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        dataframe: pandas DataFrame holding the feature columns plus a
            ``target`` column, which is used as the label.
        shuffle: when true, shuffle the records using a buffer as large as
            the whole frame.
        batch_size: number of records per batch.

    Returns:
        A ``tf.data.Dataset`` built from ``(dict(dataframe), labels)`` and
        grouped into batches of ``batch_size``.
    """
    # Work on a copy so that popping the label column below does not modify
    # the caller's DataFrame.
    dataframe = dataframe.copy()
    # The 'target' column records whether heart disease was diagnosed; it is
    # removed from the features and used as the label.
    labels = dataframe.pop('target')
    # Build the Dataset from a {column name: values} dict and the labels.
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        # Optionally randomize the record order.
        ds = ds.shuffle(buffer_size=len(dataframe))
    # Group the records into batches.
    ds = ds.batch(batch_size)
    return ds
# Convert all three subsets to Dataset objects. Only the training set is
# shuffled; validation and test data keep their original order.
train_ds = df_to_dataset(train, shuffle=True)
val_ds, test_ds = (df_to_dataset(part, shuffle=False) for part in (val, test))
# Collects every feature column that is handed to the model's input layer.
feature_columns = []

# Numeric columns: add one numeric feature column per listed field name.
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column: split the age into ranges (18-25, 25-30, 30-35, ...,
# 60-65) so that the age range is represented as a one-hot encoding.
age = feature_column.numeric_column("age")
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# Categorical column: the raw 'thal' field with a fixed vocabulary.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])

# One-hot encoded version of 'thal'.
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column: 'thal' embedded into an 8-dimensional vector space.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed column: combine the age bucket and 'thal' into a single feature,
# hashed into 1000 buckets and then wrapped as an indicator column.
crossed_feature = feature_column.crossed_column(
    [age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)
# Input layer built from the feature columns collected above.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Assemble the full model layer by layer: the feature input layer, two
# 128-unit ReLU layers, and a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating against the validation set.
model.fit(train_ds, epochs=5, validation_data=val_ds)

# Evaluate on the held-out test set and print the resulting accuracy.
test_loss, test_acc = model.evaluate(test_ds)
print('===================\nTest accuracy:', test_acc)

来源: https://www.cnblogs.com/andrewwang/p/10748806.html

与本文相关文章

暂无，快来抢沙发吧！