Commit 0db2db0d authored by Dylan Zucker's avatar Dylan Zucker

README

parent b490dc53
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.2 (/usr/local/anaconda-3.6/bin/python3.6)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/AICogSciFinalProject.iml" filepath="$PROJECT_DIR$/.idea/AICogSciFinalProject.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
This diff is collapsed.
Initial Commit
\ No newline at end of file
In the classifiers folder, there is the file text_gen_word_based.py. That file holds our model creation and training.
In that file you can change which data you are using by switching the FILE_PREFIX constant at the top of the file to
match the file name of the data you want to use. For example, if you want to use Harry Potter 7 for training or text
generation, note that its data file is named HP7.txt; change FILE_PREFIX to HP7, and the result will be a
Harry Potter 7 based model.
Our data sets are: a subset of the Bible in Bible.txt, Harry Potter 7 in HP7.txt, and a mix of Tupac lyrics and William
Shakespeare's Sonnets.
The classifier outputs graphs of its training to the figures folder, and outputs its saved weights to the weights
folder after it is done training (some weights are also updated while training). The names of the weight files correspond
to the FILE_PREFIX constant.
\ No newline at end of file
This diff is collapsed.
"""
Dylan Zucker, Ryan Pasculano, Melanie Cheng & David Schwartz
CSCI 379 - Fall 2018
Intro to AI & Cog Sci
Final Project
This file trains a model to generate Harry Potter 7 type text.
"""
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import RNN
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
text = (open("../data/HP7.txt").read())
text=text.lower()
characters = sorted(list(set(text)))
n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}
X = []
Y = []
length = len(text)
seq_length = 100
for i in range(length//300-seq_length):
sequence = text[i:i + seq_length]
label =text[i + seq_length]
X.append([char_to_n[char] for char in sequence])
Y.append(char_to_n[label])
X_modified = np.reshape(X, (len(X), seq_length, 1))
X_modified = X_modified / float(len(characters))
Y_modified = np_utils.to_categorical(Y)
model = Sequential()
model.add(LSTM(700, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(700))
model.add(Dropout(0.2))
model.add(Dense(Y_modified.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam') # adam optimizes the training constant alpha as it trains
choice = input("Input (Y) if you want to train. Input anything else if you just want to generate text: ")
if choice == "Y" or choice == "y":
filepath="../weights/weights-big.best.hdf5"
checkpoint = ModelCheckpoint(filepath, verbose=1, mode='max')
callbacks_list = [checkpoint]
model.fit(X_modified, Y_modified, epochs=100, batch_size=64, callbacks=callbacks_list, verbose = 2) # 64 cause system is optimized for powers of 2. Def fact check
model.save_weights('../weights/text_generator_big.h5')
model.load_weights('../weights/text_generator_big_original.h5')
string_mapped = X[99]
full_string = "Hobbits are an unobtrusive but very ancient people, more numerous formerly than they are today; for they love peace and quiet and good tilled earth: a well-ordered and well-farmed countryside was their favourite haunt. They do not and did not understand or like machines more complicated than a forge-bellows, a water-mill, or a hand-loom, though they were skilful with tools."
full_string = [full_string[i] for i in range(seq_length)]
#full_string = [n_to_char[value] for value in string_mapped]
for i in range(500):
x = np.reshape(string_mapped,(1,len(string_mapped), 1))
x = x / float(len(characters))
pred_index = np.argmax(model.predict(x, verbose=0))
seq = [n_to_char[value] for value in string_mapped]
full_string.append(n_to_char[pred_index])
string_mapped.append(pred_index)
string_mapped = string_mapped[1:len(string_mapped)]
txt=""
for char in full_string:
txt = txt+char
#"".join(full_string)
print(txt)
\ No newline at end of file
This diff is collapsed.
......@@ -9,7 +9,7 @@ This file trains a model to generate Harry Potter 7 type text.
# TODO Make more directories to clean the directory up
FILE_PREFIX = "Bible" # If you want to change the training data just change this to the file name before txt
FILE_PREFIX = "HP7" # If you want to change the training data just change this to the file name before txt
import string
from keras.callbacks import ModelCheckpoint
......@@ -64,8 +64,6 @@ def generation(model):
seed_text = input("Input your seed text needs to be length 50 words: ")
seq_length = len(seed_text.split()[:50])
# model = load_model('text_generator_word_based.h5')
# model.load_weights('weights-word_based.best.hdf5')
model.load_weights("../weights/text_generator_word_based_" + FILE_PREFIX + ".h5")
tokenizer = load(open('tokenizer_' + FILE_PREFIX + '.pkl', 'rb'))
# generate new text
......@@ -248,7 +246,7 @@ def main():
else:
seed_text, generated = generation(model)
print(seed_text)
print(" ".join(seed_text.split()[:50]))
print()
print(generated)
......
This diff is collapsed.
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment