```python
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
# integer encode the documents; one_hot hashes each word into [1, vocab_size),
# so distinct words can collide on the same index
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model: one 8-dimensional vector per word, flattened into a
# 32-dimensional document vector feeding a sigmoid classifier
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))
```
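After training, the `Embedding` layer's weight matrix holds one 8-dimensional vector per index. A minimal sketch of pulling it out for inspection, assuming the `model` and `padded_docs` defined above (remember `one_hot` is hash-based, so an index corresponds to a hash bucket, not necessarily a unique word):

```python
# the first layer's weights are the embedding matrix, shape (vocab_size, 8)
embeddings = model.layers[0].get_weights()[0]
print(embeddings.shape)  # (50, 8)

# vector learned for the first word of the first padded document
print(embeddings[padded_docs[0][0]])
```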
Example 2
```python
# -*- coding: utf-8 -*-
from keras.layers import Activation, Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import nltk
nltk.download('punkt')
import numpy as np

## EDA: find the longest sentence and count word frequencies
maxlen = 0
word_freqs = collections.Counter()
num_recs = 0
with open('train_data.txt', 'r') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > maxlen:
            maxlen = len(words)
        for word in words:
            word_freqs[word] += 1
        num_recs += 1
print('max_len', maxlen)
print('nb_words', len(word_freqs))

## prepare the data
MAX_FEATURES = 2000
MAX_SENTENCE_LENGTH = 40
# +2 reserves slots for the PAD and UNK pseudo-words
vocab_size = min(MAX_FEATURES, len(word_freqs)) + 2
word2index = {x[0]: i + 2 for i, x in enumerate(word_freqs.most_common(MAX_FEATURES))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v: k for k, v in word2index.items()}
X = np.empty(num_recs, dtype=list)
y = np.zeros(num_recs)
i = 0
with open('train_data.txt', 'r') as f:
    for line in f:
        label, sentence = line.strip().split("\t")
        words = nltk.word_tokenize(sentence.lower())
        seqs = []
        for word in words:
            if word in word2index:
                seqs.append(word2index[word])
            else:
                seqs.append(word2index["UNK"])
        X[i] = seqs
        y[i] = int(label)
        i += 1
X = sequence.pad_sequences(X, maxlen=MAX_SENTENCE_LENGTH)

## train/test split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

## build the network
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

## train the network
model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
          validation_data=(Xtest, ytest))

## evaluate and spot-check a few predictions
score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("\nTest score: %.3f, accuracy: %.3f" % (score, acc))
print('{} {} {}'.format('predicted', 'actual', 'sentence'))
for i in range(5):
    idx = np.random.randint(len(Xtest))
    xtest = Xtest[idx].reshape(1, 40)
    ylabel = ytest[idx]
    ypred = model.predict(xtest)[0][0]
    sent = " ".join([index2word[x] for x in xtest[0] if x != 0])
    print('{} {} {}'.format(int(round(ypred)), int(ylabel), sent))

##### classify sentences of our own
INPUT_SENTENCES = ['I love reading.', 'You are so boring.']
XX = np.empty(len(INPUT_SENTENCES), dtype=list)
i = 0
for sentence in INPUT_SENTENCES:
    words = nltk.word_tokenize(sentence.lower())
    seq = []
    for word in words:
        if word in word2index:
            seq.append(word2index[word])
        else:
            seq.append(word2index['UNK'])
    XX[i] = seq
    i += 1
XX = sequence.pad_sequences(XX, maxlen=MAX_SENTENCE_LENGTH)
labels = [int(round(x[0])) for x in model.predict(XX)]
label2word = {1: 'positive', 0: 'negative'}
for i in range(len(INPUT_SENTENCES)):
    print('{} {}'.format(label2word[labels[i]], INPUT_SENTENCES[i]))
```
The train_data.txt file is tab-separated: each line is a label (1 = positive, 0 = negative), a tab, then the sentence. Its contents look roughly like this (tabs shown collapsed to a space):
```
1 The Da Vinci Code book is just awesome.
1 this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.
1 i liked the Da Vinci Code a lot.
1 i liked the Da Vinci Code a lot.
1 I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.
1 that's not even an exaggeration ) and at midnight we went to Wal-Mart to buy the Da Vinci Code, which is amazing of course.
1 I loved the Da Vinci Code, but now I want something better and different!..
1 i thought da vinci code was great, same with kite runner.
1 The Da Vinci Code is actually a good movie...
1 I thought the Da Vinci Code was a pretty good book.
1 The Da Vinci Code is one of the most beautiful movies ive ever seen.
1 The Da Vinci Code is an * amazing * book, do not get me wrong.
1 then I turn on the light and the radio and enjoy my Da Vinci Code.
1 The Da Vinci Code was REALLY good.
1 i love da vinci code....
1 i loved da vinci code..
1 TO NIGHT:: THE DA VINCI CODE AND A BEAUTIFUL MIND...
1 THE DA VINCI CODE is AN AWESOME BOOK....
1 Thing is, I enjoyed The Da Vinci Code.
1 very da vinci code slash amazing race.
```
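To reuse the trained classifier later without retraining, both the model and the `word2index` mapping need to be persisted; a minimal sketch using standard Keras and json calls (the file names are placeholders, not from the original post):

```python
import json
from keras.models import load_model

# save the trained model and the vocabulary mapping built above
model.save('sentiment_lstm.h5')  # placeholder file name
with open('word2index.json', 'w') as f:
    json.dump(word2index, f)

# later: restore both and encode new sentences exactly as before
model = load_model('sentiment_lstm.h5')
with open('word2index.json') as f:
    word2index = json.load(f)
```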
Example 3: time series forecasting
```python
import math
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# turn a series into supervised samples: X = values at t..t+look_back-1, Y = value at t+look_back
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)

dataframe = read_csv('international-airline-passengers.CSV', usecols=[1],
                     engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
print(dataset)
# normalize the series to [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
# split into train and test sets
train_size = int(len(dataset) * 0.67)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]
# reshape into X=t and Y=t+1
look_back = 1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# reshape input to [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
testPredict = model.predict(testX)
# map predictions back to the original passenger counts
testPredict = scaler.inverse_transform(testPredict)
```
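The script imports `math` and `mean_squared_error` but never uses them; a minimal sketch that puts them to work, scoring the test predictions in the original units (assuming the variables defined above):

```python
# invert the scaling on the targets so the error is in passenger counts
testY_orig = scaler.inverse_transform(testY.reshape(-1, 1))
rmse = math.sqrt(mean_squared_error(testY_orig[:, 0], testPredict[:, 0]))
print('Test RMSE: %.2f' % rmse)
```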
```python
# sanity-check three hand-picked values; use transform (not fit_transform),
# so the scaling fitted on the full series is reused rather than re-fitted
# on these three values, which would silently change the scale
whk = numpy.array([[112], [390], [622]], dtype='float32')
whk = scaler.transform(whk)
testOne = numpy.reshape(whk, (3, 1, 1))
print(testOne)
testPredict = model.predict(testOne)
testPredict = scaler.inverse_transform(testPredict)
print(testPredict)
```
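`matplotlib.pyplot` is imported at the top but never used; a minimal sketch that plots the raw series, assuming the `dataset` and `scaler` defined above:

```python
# undo the normalization and plot the monthly passenger counts
plt.plot(scaler.inverse_transform(dataset))
plt.xlabel('month index')
plt.ylabel('passengers (thousands)')
plt.show()
```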
The international-airline-passengers.CSV file is short; its contents look roughly like this:
- "Month","International airline passengers: monthly totals in thousands. Jan 49 ? Dec 60"
- "1949-01",112
- "1949-02",118
- "1949-03",132
- "1949-04",129
- "1949-05",121
- "1949-06",135
- "1949-07",148
- "1949-08",148
- "1949-09",136
- "1949-10",119
- "1949-11",104
- "1949-12",118
- "1950-01",115
- "1950-02",126
- "1950-03",141
- "1950-04",135
- "1950-05",125
- "1950-06",149
- "1950-07",170
- "1950-08",170
- "1950-09",158
- "1950-10",133
- "1950-11",114
- "1950-12",140
- "1951-01",145
- "1951-02",150
- "1951-03",178
- "1951-04",163
- "1951-05",172
- "1951-06",178
Source: http://www.bubuko.com/infodetail-2801111.html