Allow generating poems from raw text

The filename of the raw text is supplied as the first command-line argument to
the Python script.
Tyler Hallada 2017-03-14 01:03:23 -04:00
parent 8c4b8eaaee
commit fa8bd171a1
3 changed files with 54 additions and 23 deletions
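
In miniature, the pattern this commit adopts for passing the corpus in is sketched below; the script name generate_poem.py and the corpus path are illustrative, and only the generate(sys.argv[1]) call is taken from the diff:

    # Hypothetical stand-in for the generator script: read the corpus path
    # from the first command-line argument, as the diff below does.
    import codecs
    import sys


    def generate(filename):
        with codecs.open(filename, encoding='utf-8') as corpus:
            print(corpus.read()[:80])  # sanity check: show the start of the corpus


    if __name__ == '__main__':
        generate(sys.argv[1])  # e.g. python generate_poem.py trump.txt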


@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,22 +33,31 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower():  # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                        self.inflect_engine.number_to_words(int(word)))
-                        if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+        #     reader = csv.reader(statuses, delimiter=',')
+        #     for row in reader:
+        #         if 'via buzzfeed ' not in row[1].lower():  # only English
+        #             # split title into a list of words and punctuation
+        #             title = self.spaces_and_punctuation.findall(row[2])
+        #             # spell out digits into ordinal words for syllable counting
+        #             title = [string.capwords(
+        #                 self.inflect_engine.number_to_words(int(word)))
+        #                 if word.isdigit() else word for word in title]
+        #             self.sents.append(title)
+        #             self.words.extend(title)
+        #             # all_words only contains words, no punctuation
+        #             self.all_words.extend([word for word in title
+        #                                    if not
+        #                                    self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
         self.cfd = nltk.ConditionalFreqDist(self.bigrams)
         #self.parser = Parser()
@@ -129,7 +139,8 @@ class PoemGenerator():
 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print haiku
     #generator.generate_endless_poem(None)
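
The replacement block above boils down to NLTK sentence and word tokenization over a raw text file, feeding a bigram conditional frequency distribution. A minimal standalone sketch of that pipeline, assuming the punkt tokenizer data is installed and a trump.txt corpus exists:

    import codecs

    import nltk

    # nltk.download('punkt')  # one-time download of the tokenizer models
    with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
        text = corpus.read()

    sents = nltk.tokenize.sent_tokenize(text)  # list of sentence strings
    words = nltk.tokenize.word_tokenize(text)  # flat token list, punctuation included

    # Next-word frequencies conditioned on the current word, as the generator builds.
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))
    print(cfd['the'].most_common(3))  # three most frequent successors of 'the'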

json_to_txt.py (new file)

@@ -0,0 +1,20 @@
+# Converts a json twitter dump to raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
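
A quick round trip through the new helper, assuming the input is shaped like a Twitter API dump (a JSON array of objects with a 'text' key), which is what get_text_from_json expects:

    import codecs
    import json

    from json_to_txt import get_text_from_json, write_text_to_file

    # Write a tiny fixture in the expected shape, then convert it to raw text.
    with codecs.open('tweets.json', 'w', 'utf-8') as f:
        json.dump([{'text': 'first tweet'}, {'text': 'second tweet'}], f)

    write_text_to_file('corpus.txt', get_text_from_json('tweets.json'))
    # corpus.txt now contains: first tweet second tweet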


@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the docs.
-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         # sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
             sents = [sent for sent in sents if len(sent) < 150][0:1500]
             for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
             syntaxes = pickle.load(pickle_file)
     if not os.path.exists(CFDS_FILE):
-        # corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
            cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
         with open(CFDS_FILE, 'wb+') as pickle_file:
             pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):
 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])
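
One caveat with the parameterized generate(): SYNTAXES_FILE and CFDS_FILE are pickled caches, and the new filename argument is only read when those files are absent, so pointing the script at a different corpus changes nothing until the caches are deleted. A sketch of a guard a caller might add; the cache paths here are hypothetical, since the real SYNTAXES_FILE and CFDS_FILE values are defined outside this diff:

    import os

    # Hypothetical cache paths; substitute the script's actual SYNTAXES_FILE
    # and CFDS_FILE constants.
    for cache in ('syntaxes.pickle', 'cfds.pickle'):
        if os.path.exists(cache):
            os.remove(cache)  # force a re-parse of the newly supplied corpus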