train.py
import tensorflow as tf
import numpy as np
import os
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from string import punctuation

sequence_length = 100
BATCH_SIZE = 128
EPOCHS = 3
# dataset file path
FILE_PATH = "data/wonderland.txt"
# FILE_PATH = "data/python_code.py"
BASENAME = os.path.basename(FILE_PATH)

# commented because already downloaded
# import requests
# content = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt").text
# open("data/wonderland.txt", "w", encoding="utf-8").write(content)

# read the data
text = open(FILE_PATH, encoding="utf-8").read()
# lowercase everything; comment this line out if you want uppercase characters as well
text = text.lower()
# remove punctuation
text = text.translate(str.maketrans("", "", punctuation))

# print some stats
n_chars = len(text)
vocab = ''.join(sorted(set(text)))
print("unique_chars:", vocab)
n_unique_chars = len(vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

# dictionary that converts characters to integers
char2int = {c: i for i, c in enumerate(vocab)}
# dictionary that converts integers to characters
int2char = {i: c for i, c in enumerate(vocab)}

# save these dictionaries for later generation
pickle.dump(char2int, open(f"{BASENAME}-char2int.pickle", "wb"))
pickle.dump(int2char, open(f"{BASENAME}-int2char.pickle", "wb"))

# convert all text into integers
encoded_text = np.array([char2int[c] for c in text])
# construct tf.data.Dataset object
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

# print the first 8 characters
for char in char_dataset.take(8):
    print(char.numpy(), int2char[char.numpy()])

# build sequences by batching
sequences = char_dataset.batch(2*sequence_length + 1, drop_remainder=True)
# print the first 2 sequences
for sequence in sequences.take(2):
    print(''.join([int2char[i] for i in sequence.numpy()]))

def split_sample(sample):
    # example:
    #   sequence_length is 10, sample is "python is a great pro" (21 characters)
    #   the first element of ds will be ('python is ', 'a') encoded as integers
    ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
    for i in range(1, (len(sample)-1) // 2):
        # first (input_, target) will be ('ython is a', ' ')
        # second (input_, target) will be ('thon is a ', 'g')
        # third (input_, target) will be ('hon is a g', 'r')
        # and so on
        input_ = sample[i: i+sequence_length]
        target = sample[i+sequence_length]
        # extend the dataset with these samples using the concatenate() method
        other_ds = tf.data.Dataset.from_tensors((input_, target))
        ds = ds.concatenate(other_ds)
    return ds

# prepare inputs and targets
dataset = sequences.flat_map(split_sample)

def one_hot_samples(input_, target):
    # one-hot encode the inputs and the targets
    # Example:
    #   if character 'd' is encoded as 3 and n_unique_chars = 5,
    #   the result is the vector [0, 0, 0, 1, 0], since 'd' is the 4th character
    return tf.one_hot(input_, n_unique_chars), tf.one_hot(target, n_unique_chars)

dataset = dataset.map(one_hot_samples)

# print the first 2 samples
for element in dataset.take(2):
    print("Input:", ''.join([int2char[np.argmax(char_vector)] for char_vector in element[0].numpy()]))
    print("Target:", int2char[np.argmax(element[1].numpy())])
    print("Input shape:", element[0].shape)
    print("Target shape:", element[1].shape)
    print("="*50, "\n")

# repeat, shuffle and batch the dataset
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True)

# building the model
# model = Sequential([
#     LSTM(128, input_shape=(sequence_length, n_unique_chars)),
#     Dense(n_unique_chars, activation="softmax"),
# ])

# a better model (slower to train, obviously)
model = Sequential([
    LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(n_unique_chars, activation="softmax"),
])

# uncomment to resume training from previously saved weights
# model.load_weights(f"results/{BASENAME}-{sequence_length}.h5")

model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# make sure the results folder exists
if not os.path.isdir("results"):
    os.mkdir("results")

# checkpoint = ModelCheckpoint("results/" + BASENAME + "-{loss:.2f}.h5", verbose=1)

# train the model
model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)
# save the model
model.save(f"results/{BASENAME}-{sequence_length}.h5")
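The ModelCheckpoint callback stays commented out above. A minimal sketch of enabling it (assuming you want a separate weights file saved after every epoch, named by that epoch's loss) could look like this; note that the "{loss:.2f}" placeholder is filled in by Keras at epoch end, not by Python's str.format:

# sketch only: save a checkpoint after every epoch, with the loss in the filename
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("results/" + BASENAME + "-{loss:.2f}.h5", verbose=1)
model.fit(ds,
          steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE,
          epochs=EPOCHS,
          callbacks=[checkpoint])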
generate.py

import numpy as np
import pickle
import tqdm
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

sequence_length = 100
# dataset file path
FILE_PATH = "data/wonderland.txt"
# FILE_PATH = "data/python_code.py"
BASENAME = os.path.basename(FILE_PATH)

# load vocab dictionaries
char2int = pickle.load(open(f"{BASENAME}-char2int.pickle", "rb"))
int2char = pickle.load(open(f"{BASENAME}-int2char.pickle", "rb"))
vocab_size = len(char2int)

# building the model
model = Sequential([
    LSTM(256, input_shape=(sequence_length, vocab_size), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(vocab_size, activation="softmax"),
])
# load the optimal weights
model.load_weights(f"results/{BASENAME}-{sequence_length}.h5")

# the seed text that generation starts from
seed = "alice is pretty"
s = seed
n_chars = 400  # number of characters to generate
generated = ""
for i in tqdm.tqdm(range(n_chars), "Generating text"):
    # make the input sequence (one-hot encoded, left-padded with zeros)
    X = np.zeros((1, sequence_length, vocab_size))
    for t, char in enumerate(seed):
        X[0, (sequence_length - len(seed)) + t, char2int[char]] = 1
    # predict the next character
    predicted = model.predict(X, verbose=0)[0]
    # convert the probability vector to an integer
    next_index = np.argmax(predicted)
    # convert the integer to a character
    next_char = int2char[next_index]
    # add the character to the results
    generated += next_char
    # shift the seed: drop the first character and append the predicted one
    seed = seed[1:] + next_char

print("Seed:", s)
print("Generated text:")
print(generated)
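Note that train.py has to run first, so the pickled vocabulary files and the saved weights under results/ exist before generate.py is started. The generation loop above can also be wrapped into a small helper to try different seeds; a sketch (the generate_text name and its defaults are not part of the original script):

# sketch only: the same greedy-argmax loop from generate.py as a reusable function
def generate_text(seed, n_chars=400):
    generated = ""
    for _ in range(n_chars):
        # one-hot encode the current seed, left-padded with zeros
        X = np.zeros((1, sequence_length, vocab_size))
        for t, char in enumerate(seed):
            X[0, (sequence_length - len(seed)) + t, char2int[char]] = 1
        # pick the most likely next character
        predicted = model.predict(X, verbose=0)[0]
        next_char = int2char[np.argmax(predicted)]
        generated += next_char
        # slide the window: drop the oldest character, append the new one
        seed = seed[1:] + next_char
    return generated

print(generate_text("alice is pretty"))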