源代码如下，但质量较差。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Character-level sequence model: train an LSTM on a Chinese novel and
# generate text from it.
# NOTE(review): the original had the shebang on line 2 and declared the
# coding twice; a shebang only works as the very first line.

import gc
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.layers import LSTM, Activation, Dense, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

# Large default figure size for the notebook-style plots.
plt.rcParams['figure.figsize'] = (20, 10)

# Fix the RNG so shuffling/sampling below is reproducible.
np.random.seed(82832)
# Training corpus: the novel "Four Generations Under One Roof" (《四世同堂》).
# Any long novel with a distinctive, consistent style works well (e.g. Jin
# Yong's wuxia novels), as does a large crawl of news articles: plenty of
# text in a uniform style is what makes a character model easy to train.

# Read the corpus line by line.  The raw lines do not match the fixed
# virtual-sentence length used below, but they are needed to compute the
# average line length.
with io.open("new.txt", encoding='utf-8') as fo:
    alltext0 = fo.readlines()

# Read the whole corpus as one string.  The context manager closes the
# handle deterministically; the original `io.open(...).read()` leaked it.
with io.open("new.txt", encoding='utf-8') as fo:
    alltext = fo.read()

# Notebook artifact: the number of distinct characters was this cell's output.
len(set(alltext))
# Model at the single-character level: extract every distinct character and
# build bidirectional char <-> index lookup tables.
# (A commented-out manual dict-building loop from the original was removed;
# sorted(set(...)) plus enumerate does the same thing.)
sortedcharset = sorted(set(alltext))
char_indices = dict((c, i) for i, c in enumerate(sortedcharset))
indices_char = dict((i, c) for i, c in enumerate(sortedcharset))
# The corpus is split into fixed-length "virtual sentences" below; that fixed
# length is usually chosen near the average real line length computed here.
linecount = len(alltext0)
# Straight mean instead of the original incremental running mean, which was
# needlessly convoluted and accumulated floating-point error; also guards
# against an empty corpus.
sentencelength = sum(len(line) for line in alltext0) / linecount if linecount else 0.0
print(sentencelength)
print(linecount)
# Cut the corpus into overlapping windows of `maxlen` characters, advancing
# `step` characters at a time; the character immediately after each window
# is its prediction target.
maxlen = 40
step = 3
window_starts = range(0, len(alltext) - maxlen, step)
sentences = [alltext[i:i + maxlen] for i in window_starts]
next_chars = [alltext[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))
# Matrixizing every virtual sentence at once would build a dense array of
# roughly 30 GB, which neither fits in RAM on most machines nor can be handed
# to the GPU in one piece.  Instead, feed the model through fit_generator:
# each call materializes only the current batch as a dense float32 array.

def data_generator(X, y, batch_size):
    """Yield (X_batch, y_batch) float32 pairs from X/y indefinitely.

    The row order is shuffled once per epoch; ``batch_size`` values < 1
    fall back to 256.  Intended for Keras ``fit_generator``.
    """
    if batch_size < 1:
        batch_size = 256
    number_of_batches = X.shape[0] // batch_size
    counter = 0
    shuffle_index = np.arange(np.shape(y)[0])
    np.random.shuffle(shuffle_index)
    while 1:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = (X[index_batch, :, :]).astype('float32')
        y_batch = (y[index_batch, :]).astype('float32')
        counter += 1
        yield (np.array(X_batch), y_batch)
        # BUG FIX: the original reset whenever counter < number_of_batches,
        # which re-shuffled after *every* batch and replayed only the first
        # slice.  Reset only once a full epoch has been served.
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0
# One-shot demonstration of the per-batch matrixization the generator does
# internally: pick a shuffled slice of sentences and one-hot encode it.
batch_size = 10240
number_of_batches = len(sentences) // batch_size
counter = 0
shuffle_index = np.arange(len(sentences))
np.random.shuffle(shuffle_index)
for i in range(number_of_batches):
    index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
    subsentences = [sentences[s] for s in index_batch]
    # `bool` replaces the alias `np.bool`, removed in NumPy 1.24.
    X = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype=bool)
    y = np.zeros((batch_size, len(sortedcharset)), dtype=bool)
    for j in range(len(subsentences)):
        for t in range(maxlen):
            X[j, t, char_indices[subsentences[j][t]]] = 1
        # BUG FIX: the target must come from the shuffled corpus position
        # index_batch[j]; the original used the batch-local index j, pairing
        # every sentence with the wrong next character.
        y[j, char_indices[next_chars[index_batch[j]]]] = 1
    X = X.astype('float32')
    y = y.astype('float32')
    counter += 1
    print((X.shape, y.shape))
# The demo above still pre-builds huge arrays.  Moving the encoding into the
# data generator (data_generator2 below) means each batch is produced and
# shipped to the GPU on demand, so nothing large ever sits in memory.

# Build the model: a single LSTM over one-hot character frames followed by a
# softmax over the whole character set.  (Commented-out experimental layers
# and the RMSprop alternative were removed as dead code.)
batch_size = 300
print('Build model...')
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(sortedcharset)),
               recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(len(sortedcharset)))
model.add(Activation('softmax'))
# Adam with a small learning rate; `lr` is the argument name this Keras
# generation uses (newer releases call it `learning_rate`).
adamoptimizer = keras.optimizers.Adam(lr=1e-4)
model.compile(loss='categorical_crossentropy', optimizer=adamoptimizer)
print('Finished compiling')
model.summary()
def data_generator2(sentences, sortedcharset, char_indices, maxlen=40,
                    batch_size=256, next_chars=None):
    """Yield one-hot (X, y) float32 batches built on the fly.

    X has shape (batch_size, maxlen, len(sortedcharset)) and y has shape
    (batch_size, len(sortedcharset)).  The sentence order is reshuffled once
    per epoch.  ``next_chars`` holds the target character for each sentence;
    it defaults to the module-level list for backward compatibility.
    """
    if next_chars is None:
        # Backward compatible with the original, which read the module-level
        # `next_chars` list directly.
        next_chars = globals()['next_chars']
    if batch_size < 1:
        batch_size = 256
    number_of_batches = len(sentences) // batch_size
    counter = 0
    shuffle_index = np.arange(len(sentences))
    np.random.shuffle(shuffle_index)
    while 1:
        index_batch = shuffle_index[batch_size * counter:batch_size * (counter + 1)]
        subsentences = [sentences[s] for s in index_batch]
        # `bool` replaces the alias `np.bool`, removed in NumPy 1.24.
        X = np.zeros((batch_size, maxlen, len(sortedcharset)), dtype=bool)
        y = np.zeros((batch_size, len(sortedcharset)), dtype=bool)
        for j, sentence in enumerate(subsentences):
            for t in range(maxlen):
                X[j, t, char_indices[sentence[t]]] = 1
            # BUG FIX: the target must come from the shuffled corpus position
            # index_batch[j]; the original used the batch-local index j,
            # pairing every sentence with the wrong next character.
            y[j, char_indices[next_chars[index_batch[j]]]] = 1
        X = X.astype('float32')
        y = y.astype('float32')
        counter += 1
        yield (np.array(X), np.array(y))
        # BUG FIX: reset only after a full epoch; the original condition
        # (counter < number_of_batches) reshuffled after every single batch.
        if counter >= number_of_batches:
            np.random.shuffle(shuffle_index)
            counter = 0
# Train via the generator: one epoch walks every window exactly once.
steps = len(sentences) // batch_size
train_batches = data_generator2(sentences, sortedcharset, char_indices,
                                maxlen=maxlen, batch_size=batch_size)
model.fit_generator(train_batches, steps_per_epoch=steps, epochs=25)

# Persist the trained model (architecture + weights) to disk.
model.save('whk.h5')
def sample(preds, temperature=1.0):
    """Draw one class index from the distribution `preds`.

    The probabilities are re-weighted by `temperature` (lower values make
    the draw greedier, higher values more uniform) before a single
    multinomial draw.
    """
    scaled = np.asarray(preds).astype('float64')
    scaled = np.log(scaled) / temperature
    weights = np.exp(scaled)
    weights = weights / np.sum(weights)
    draw = np.random.multinomial(1, weights, 1)
    return np.argmax(draw)
# Seed generation with one corpus window and extend it by 20 characters.
start_index = 1
sentence = alltext[start_index: start_index + maxlen]
sentence0 = sentence
generated = ''  # BUG FIX: this assignment was fused with the next statement in the source
for i in range(20):
    # Rebuild the one-hot input from the *current* window on every step.
    # The original encoded only the initial window once and then predicted
    # repeatedly from stale input (unlike GenSentence below).
    x = np.zeros((1, maxlen, len(sortedcharset))).astype('float32')
    for t, char in enumerate(sentence):
        x[0, t, char_indices[char]] = 1.
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 1.1)
    next_char = indices_char[next_index]
    generated += next_char
    sentence = sentence[1:] + next_char
print(sentence0)
print("=================")
print(' '.join(generated))
# (Dead preamble removed: the original re-assigned start_index/sentence/x
# here, but those values were never used before being overwritten below.)
def GenSentence(original, length=20, temperature=1.20):
    """Generate `length` characters continuing the seed string `original`.

    Each step one-hot encodes the current window, asks the model for the
    next-character distribution, and samples from it at `temperature`.
    Relies on the module-level `model`, `maxlen`, `sortedcharset`,
    `char_indices`, `indices_char` and `sample`.  Returns only the newly
    generated characters.
    """
    sentence = original
    generated = ''
    for _ in range(length):
        x = np.zeros((1, maxlen, len(sortedcharset))).astype('float32')
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
    return generated
# Chain two generations: the first continuation is fed back in as the seed
# for a second one.
start_index = 3
seed = alltext[start_index: start_index + maxlen]
continuation = GenSentence(seed)
print(seed + "----->" + continuation)
print("==========")
followup = GenSentence(continuation)
print(continuation + "------>" + followup)
# Final cleanup: drop the large arrays and the model, then force garbage
# collection (repeated collections help break lingering reference cycles).
try:
    del X, y, model
except NameError:
    # Narrowed from the original bare `except:` — only "name not defined"
    # is expected here; anything else should surface.
    print('Objects not found...')
for i in range(10):
    gc.collect()
来源: http://www.bubuko.com/infodetail-2906995.html