Allow generating poems from raw text
The filename of the raw text is supplied as the first command-line argument to the Python script.
commit fa8bd171a1
parent 8c4b8eaaee
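As a rough illustration of the workflow this commit enables (hedged: only json_to_txt.py is named in the diff below, so the generator script's filename and all data file names here are placeholders):

    # Illustrative only: 'generate.py', 'tweets.json' and 'corpus.txt' are placeholder names.
    import subprocess

    # 1. Flatten a Twitter JSON dump (argv[1]) into a raw text file (argv[2]) with the new helper.
    subprocess.run(['python', 'json_to_txt.py', 'tweets.json', 'corpus.txt'], check=True)

    # 2. Pass the raw text file to the poem generator as its first command-line argument.
    subprocess.run(['python', 'generate.py', 'corpus.txt'], check=True)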
@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,22 +33,31 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower(): # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                        self.inflect_engine.number_to_words(int(word)))
-                        if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+        #     reader = csv.reader(statuses, delimiter=',')
+        #     for row in reader:
+        #         if 'via buzzfeed ' not in row[1].lower(): # only English
+        #             # split title into a list of words and punctuation
+        #             title = self.spaces_and_punctuation.findall(row[2])
+        #             # spell out digits into ordinal words for syllable counting
+        #             title = [string.capwords(
+        #                 self.inflect_engine.number_to_words(int(word)))
+        #                 if word.isdigit() else word for word in title]
+        #             self.sents.append(title)
+        #             self.words.extend(title)
+        #             # all_words only contains words, no punctuation
+        #             self.all_words.extend([word for word in title
+        #                                    if not
+        #                                    self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
         self.cfd = nltk.ConditionalFreqDist(self.bigrams)
         #self.parser = Parser()
@@ -129,7 +139,8 @@ class PoemGenerator():
 
 
 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print(haiku)
     #generator.generate_endless_poem(None)
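The hunks above swap the BuzzFeed CSV source for plain-text tokenization: the corpus is read as UTF-8, split into sentences and words with NLTK, and the word bigrams feed a conditional frequency distribution. A minimal standalone sketch of that pipeline, assuming a hypothetical 'corpus.txt' and that NLTK's punkt tokenizer data is already installed:

    # Sketch of the raw-text pipeline introduced above; 'corpus.txt' is a placeholder.
    import codecs
    import nltk

    with codecs.open('corpus.txt', 'r', 'utf-8') as corpus:
        text = corpus.read()

    sents = nltk.tokenize.sent_tokenize(text)
    words = nltk.tokenize.word_tokenize(text)

    # For each word, count how often each successor follows it in the corpus.
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(words))

    # e.g. the most likely word to follow the first word of the text
    if words:
        print(cfd[words[0]].most_common(1))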
json_to_txt.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# Converts a json twitter dump to raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
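For context, get_text_from_json assumes the dump parses to a top-level JSON array whose items each carry a 'text' field, as a tweet dump does. A hedged round-trip sketch with placeholder file names:

    # Usage sketch; 'tweets.json' and 'corpus.txt' are placeholders, and json_to_txt.py
    # is assumed to be importable from the working directory.
    import json
    from json_to_txt import get_text_from_json, write_text_to_file

    with open('tweets.json', 'w', encoding='utf-8') as f:
        json.dump([{'text': 'First status.'}, {'text': 'Second status.'}], f)

    write_text_to_file('corpus.txt', get_text_from_json('tweets.json'))
    # corpus.txt now holds the statuses joined by single spaces.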
@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the docs.
 
 
-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         # sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
             sents = [sent for sent in sents if len(sent) < 150][0:1500]
             for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
             syntaxes = pickle.load(pickle_file)
 
     if not os.path.exists(CFDS_FILE):
-        # corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
             with open(CFDS_FILE, 'wb+') as pickle_file:
                 pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):
 
 
 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])