源代码如下，但质量较差。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Character-level sequence model: train an LSTM on a Chinese novel and
# generate text from it.
# NOTE(review): the original had the shebang on line 2 and declared the
# coding twice; a shebang only works as the very first line.

import gc
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.layers import LSTM, Activation, Dense, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

# Large default figure size for the notebook-style plots.
plt.rcParams['figure.figsize'] = (20, 10)

# Fix the RNG so shuffling/sampling below is reproducible.
np.random.seed(82832)
# Training corpus: the novel "Four Generations Under One Roof" (《四世同堂》).
# Any long novel with a distinctive, consistent style works well (e.g. Jin
# Yong's wuxia novels), as does a large crawl of news articles: plenty of
# text in a uniform style is what makes a character model easy to train.

# Read the corpus line by line.  The raw lines do not match the fixed
# virtual-sentence length used below, but they are needed to compute the
# average line length.
with io.open("new.txt", encoding='utf-8') as fo:
    alltext0 = fo.readlines()

# Read the whole corpus as one string.  The context manager closes the
# handle deterministically; the original `io.open(...).read()` leaked it.
with io.open("new.txt", encoding='utf-8') as fo:
    alltext = fo.read()

# Notebook artifact: the number of distinct characters was this cell's output.
len(set(alltext))
# Model at the single-character level: extract every distinct character and
# build bidirectional char <-> index lookup tables.
# (A commented-out manual dict-building loop from the original was removed;
# sorted(set(...)) plus enumerate does the same thing.)
sortedcharset = sorted(set(alltext))
char_indices = dict((c, i) for i, c in enumerate(sortedcharset))
indices_char = dict((i, c) for i, c in enumerate(sortedcharset))
# The corpus is split into fixed-length "virtual sentences" below; that fixed
# length is usually chosen near the average real line length computed here.
linecount = len(alltext0)
# Straight mean instead of the original incremental running mean, which was
# needlessly convoluted and accumulated floating-point error; also guards
# against an empty corpus.
sentencelength = sum(len(line) for line in alltext0) / linecount if linecount else 0.0
print(sentencelength)
print(linecount)
# Cut the corpus into overlapping windows of `maxlen` characters, advancing
# `step` characters at a time; the character immediately after each window
# is its prediction target.
maxlen = 40
step = 3
window_starts = range(0, len(alltext) - maxlen, step)
sentences = [alltext[i:i + maxlen] for i in window_starts]
next_chars = [alltext[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))
# Matrixizing every virtual sentence at once would build a dense array of
# roughly 30 GB, which neither fits in RAM on most machines nor can be handed
# to the GPU in one piece.  Instead, feed the model through fit_generator:
# each call materializes only the current batch as a dense float32 array.

def data_generator(X, y, batch_size):
    """Yield (X_batch, y_batch) float32 pairs from X/y indefinitely.

    The row order is shuffled once per epoch; ``batch_size`` values < 1
    fall back to 256.  Intended for Keras ``fit_generator``.
    """
    if batch_size < 1:
        batch_size = 256
    number_of_batches = X.shape[0] // batch_size
    counter = 0
    shuffle_index = np.arange(np.shape(y)[0])
    np.random.shuffle(shuffle_index)
    while 1:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = (X[index_batch, :, :]).astype('float32')
        y_batch = (y[index_batch, :]).astype('float32')
        counter += 1
        yield (np.array(X_batch), y_batch)
        # BUG FIX: the original reset whenever counter < number_of_batches,
        # which re-shuffled after *every* batch and replayed only the first
        # slice.  Reset only once a full epoch has been served.
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0
# One-shot demonstration of the per-batch matrixization the generator does
# internally: pick a shuffled slice of sentences and one-hot encode it.
batch_size = 10240
number_of_batches = len(sentences) // batch_size
counter = 0
shuffle_index = np.arange(len(sentences))
np.random.shuffle(shuffle_index)
for i in range(number_of_batches):
    index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
    subsentences = [sentences[s] for s in index_batch]
    # `bool` replaces the alias `np.bool`, removed in NumPy 1.24.
    X = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype=bool)
    y = np.zeros((batch_size, len(sortedcharset)), dtype=bool)
    for j in range(len(subsentences)):
        for t in range(maxlen):
            X[j, t, char_indices[subsentences[j][t]]] = 1
        # BUG FIX: the target must come from the shuffled corpus position
        # index_batch[j]; the original used the batch-local index j, pairing
        # every sentence with the wrong next character.
        y[j, char_indices[next_chars[index_batch[j]]]] = 1
    X = X.astype('float32')
    y = y.astype('float32')
    counter += 1
    print((X.shape, y.shape))
# The demo above still pre-builds huge arrays.  Moving the encoding into the
# data generator (data_generator2 below) means each batch is produced and
# shipped to the GPU on demand, so nothing large ever sits in memory.

# Build the model: a single LSTM over one-hot character frames followed by a
# softmax over the whole character set.  (Commented-out experimental layers
# and the RMSprop alternative were removed as dead code.)
batch_size = 300
print('Build model...')
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(sortedcharset)),
               recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(len(sortedcharset)))
model.add(Activation('softmax'))
# Adam with a small learning rate; `lr` is the argument name this Keras
# generation uses (newer releases call it `learning_rate`).
adamoptimizer = keras.optimizers.Adam(lr=1e-4)
model.compile(loss='categorical_crossentropy', optimizer=adamoptimizer)
print('Finished compiling')
model.summary()
def data_generator2(sentences, sortedcharset, char_indices, maxlen=40,
                    batch_size=256, next_chars=None):
    """Yield one-hot (X, y) float32 batches built on the fly.

    X has shape (batch_size, maxlen, len(sortedcharset)) and y has shape
    (batch_size, len(sortedcharset)).  The sentence order is reshuffled once
    per epoch.  ``next_chars`` holds the target character for each sentence;
    it defaults to the module-level list for backward compatibility.
    """
    if next_chars is None:
        # Backward compatible with the original, which read the module-level
        # `next_chars` list directly.
        next_chars = globals()['next_chars']
    if batch_size < 1:
        batch_size = 256
    number_of_batches = len(sentences) // batch_size
    counter = 0
    shuffle_index = np.arange(len(sentences))
    np.random.shuffle(shuffle_index)
    while 1:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        subsentences = [sentences[s] for s in index_batch]
        # `bool` replaces the alias `np.bool`, removed in NumPy 1.24.
        X = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype=bool)
        y = np.zeros((batch_size, len(sortedcharset)), dtype=bool)
        for j, sentence in enumerate(subsentences):
            for t in range(maxlen):
                X[j, t, char_indices[sentence[t]]] = 1
            # BUG FIX: the target must come from the shuffled corpus position
            # index_batch[j]; the original used the batch-local index j,
            # pairing every sentence with the wrong next character.
            y[j, char_indices[next_chars[index_batch[j]]]] = 1
        X = X.astype('float32')
        y = y.astype('float32')
        counter += 1
        yield (np.array(X), np.array(y))
        # BUG FIX: reset only after a full epoch; the original condition
        # (counter < number_of_batches) reshuffled after every single batch.
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0
# Train via the generator: one epoch walks every window exactly once.
steps = len(sentences) // batch_size
train_batches = data_generator2(sentences, sortedcharset, char_indices,
                                maxlen=maxlen, batch_size=batch_size)
model.fit_generator(train_batches, steps_per_epoch=steps, epochs=25)

# Persist the trained model (architecture + weights) to disk.
model.save('whk.h5')
def sample(preds, temperature=1.0):
    """Draw one class index from the distribution `preds`.

    The probabilities are re-weighted by `temperature` (lower values make
    the draw greedier, higher values more uniform) before a single
    multinomial draw.
    """
    scaled = np.asarray(preds).astype('float64')
    scaled = np.log(scaled) / temperature
    weights = np.exp(scaled)
    weights = weights / np.sum(weights)
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)
# Seed generation with one corpus window and extend it by 20 characters.
start_index = 1
sentence = alltext[start_index: start_index + maxlen]
sentence0 = sentence
generated = ''  # BUG FIX: this assignment was fused with the next statement in the source
for i in range(20):
    # Rebuild the one-hot input from the *current* window on every step.
    # The original encoded only the initial window once and then predicted
    # repeatedly from stale input (unlike GenSentence below).
    x = np.zeros((1, maxlen, len(sortedcharset))).astype('float32')
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 1.1)
    next_char = indices_char[next_index]
    generated += next_char
    sentence = sentence[1:] + next_char
print(sentence0)
print("=================")
print(' '.join(generated))
# (Dead preamble removed: the original re-assigned start_index/sentence/x
# here, but those values were never used before being overwritten below.)
def GenSentence(original, length=20, temperature=1.20):
    """Generate `length` characters continuing the seed string `original`.

    Each step one-hot encodes the current window, asks the model for the
    next-character distribution, and samples from it at `temperature`.
    Relies on the module-level `model`, `maxlen`, `sortedcharset`,
    `char_indices`, `indices_char` and `sample`.  Returns only the newly
    generated characters.
    """
    sentence = original
    generated = ''
    for _ in range(length):
        x = np.zeros((1, maxlen, len(sortedcharset))).astype('float32')
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
    return generated
# Chain two generations: the first continuation is fed back in as the seed
# for a second one.
start_index = 3
seed = alltext[start_index: start_index + maxlen]
continuation = GenSentence(seed)
print(seed + "----->" + continuation)
print("==========")
followup = GenSentence(continuation)
print(continuation + "------>" + followup)
# Final cleanup: drop the large arrays and the model, then force garbage
# collection (repeated collections help break lingering reference cycles).
try:
    del X, y, model
except NameError:
    # Narrowed from the original bare `except:` — only "name not defined"
    # is expected here; anything else should surface.
    print('Objects not found...')
for i in range(10):
    gc.collect()
来源: http://www.bubuko.com/infodetail-2906995.html