Allow generating poems from raw text
The filename of the raw text is supplied via the first command-line argument to the Python script.
parent
8c4b8eaaee
commit
fa8bd171a1
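
The syntax-based generator's entry point now takes the corpus path from the command line (generate(sys.argv[1]) in the last hunk below). A minimal sketch of the same pattern in isolation, with illustrative script and file names:

import codecs
import sys

import nltk

# Take the corpus filename from the first command-line argument and
# sentence-tokenize it, mirroring the pattern this commit introduces.
# Assumes NLTK's 'punkt' tokenizer data is already downloaded.
def load_corpus(filename):
    with codecs.open(filename, encoding='utf-8') as corpus:
        return nltk.tokenize.sent_tokenize(corpus.read())

if __name__ == '__main__':
    sents = load_corpus(sys.argv[1])
    print(len(sents), 'sentences loaded')

Invoked as, for example: python generate.py corpus.txt (both names here are hypothetical).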
@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,22 +33,31 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower():  # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                        self.inflect_engine.number_to_words(int(word)))
-                        if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+        #     reader = csv.reader(statuses, delimiter=',')
+        #     for row in reader:
+        #         if 'via buzzfeed ' not in row[1].lower():  # only English
+        #             # split title into a list of words and punctuation
+        #             title = self.spaces_and_punctuation.findall(row[2])
+        #             # spell out digits into ordinal words for syllable counting
+        #             title = [string.capwords(
+        #                 self.inflect_engine.number_to_words(int(word)))
+        #                 if word.isdigit() else word for word in title]
+        #             self.sents.append(title)
+        #             self.words.extend(title)
+        #             # all_words only contains words, no punctuation
+        #             self.all_words.extend([word for word in title
+        #                                    if not
+        #                                    self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
         self.cfd = nltk.ConditionalFreqDist(self.bigrams)
         #self.parser = Parser()
@@ -129,7 +139,8 @@ class PoemGenerator():


 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print(haiku)
     #generator.generate_endless_poem(None)
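
An aside on the NLTK pieces fed here: nltk.ConditionalFreqDist over the bigram list maps each word to a frequency distribution of its observed successors, which is presumably what generate_haiku samples from when chaining words. A self-contained sketch with a toy sentence (assumes NLTK's 'punkt' data is installed):

import nltk

# Build word -> successor-frequency counts from a toy corpus.
words = nltk.tokenize.word_tokenize('the cat sat on the mat and the cat slept')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))

print(cfd['the'].max())          # 'cat' -- most frequent successor of 'the'
print(cfd['cat'].most_common())  # [('sat', 1), ('slept', 1)]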
json_to_txt.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# Converts a JSON Twitter dump to a raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
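
json_to_txt.py assumes the dump is a top-level JSON array of status objects, each carrying a 'text' key (the only field it reads). A minimal sketch of matching input and the resulting invocation, with illustrative filenames:

import json

# Write a tiny stand-in for a Twitter JSON dump: an array of objects,
# each with a 'text' key, which is all json_to_txt.py looks at.
sample = [
    {'text': 'First status.'},
    {'text': 'Second status.'},
]
with open('tweets.json', 'w', encoding='utf-8') as f:
    json.dump(sample, f)

# Then: python json_to_txt.py tweets.json corpus.txt
# corpus.txt ends up containing the two texts joined by a single space.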
@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the docs.


-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         # sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
             sents = [sent for sent in sents if len(sent) < 150][0:1500]
             for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
             syntaxes = pickle.load(pickle_file)

     if not os.path.exists(CFDS_FILE):
-        # corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
         with open(CFDS_FILE, 'wb+') as pickle_file:
             pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):


 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])
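
A closing note on the caching in generate above: syntaxes and CFDs are computed once, pickled to SYNTAXES_FILE and CFDS_FILE, and reloaded on later runs. The same pattern in isolation, as a sketch (the cache path and builder are made up for illustration):

import os
import pickle

CACHE_FILE = 'cfds.pickle'  # hypothetical cache path


def load_or_build(build_fn):
    # Reuse the pickled result when it exists; otherwise compute it
    # once with build_fn and cache it for subsequent runs.
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, 'rb') as f:
            return pickle.load(f)
    data = build_fn()
    with open(CACHE_FILE, 'wb+') as f:
        pickle.dump(data, f)
    return data


cfds = load_or_build(lambda: {'bigrams': 'stand-in data'})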