Allow generating poems from raw text

The filename of the raw text file is supplied as the first command-line argument to the Python script.
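
For example, a hypothetical invocation might look like the following, where tweets.json, corpus.txt, and generate_poem.py are illustrative stand-ins (the generator script's actual filename is not shown in this diff):

    python json_to_txt.py tweets.json corpus.txt
    python generate_poem.py corpus.txt
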
Tyler Hallada 2017-03-14 01:03:23 -04:00
parent 8c4b8eaaee
commit fa8bd171a1
3 changed files with 54 additions and 23 deletions


@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,20 +33,29 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower(): # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                        self.inflect_engine.number_to_words(int(word)))
-                        if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+        #     reader = csv.reader(statuses, delimiter=',')
+        #     for row in reader:
+        #         if 'via buzzfeed ' not in row[1].lower(): # only English
+        #             # split title into a list of words and punctuation
+        #             title = self.spaces_and_punctuation.findall(row[2])
+        #             # spell out digits into ordinal words for syllable counting
+        #             title = [string.capwords(
+        #                 self.inflect_engine.number_to_words(int(word)))
+        #                 if word.isdigit() else word for word in title]
+        #             self.sents.append(title)
+        #             self.words.extend(title)
+        #             # all_words only contains words, no punctuation
+        #             self.all_words.extend([word for word in title
+        #                                    if not
+        #                                    self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
@@ -129,7 +139,8 @@ class PoemGenerator():
 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print haiku
     #generator.generate_endless_poem(None)

json_to_txt.py (new file)

@@ -0,0 +1,20 @@
+# Converts a json twitter dump to raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
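
A minimal sketch of calling the two helpers directly, assuming an illustrative dump tweets.json (a JSON array of objects with a 'text' field) and an output file corpus.txt, neither of which is part of this commit:

    texts = get_text_from_json('tweets.json')   # list of tweet texts pulled from the dump
    write_text_to_file('corpus.txt', texts)      # texts joined with single spaces by default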


@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the docs.
-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         # sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
             sents = [sent for sent in sents if len(sent) < 150][0:1500]
             for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
         syntaxes = pickle.load(pickle_file)
     if not os.path.exists(CFDS_FILE):
         # corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
         with open(CFDS_FILE, 'wb+') as pickle_file:
             pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):
 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])