diff --git a/generate_poem.py b/generate_poem.py
index f724327..93689df 100644
--- a/generate_poem.py
+++ b/generate_poem.py
@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,22 +33,31 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower():  # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                        self.inflect_engine.number_to_words(int(word)))
-                        if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+        #     reader = csv.reader(statuses, delimiter=',')
+        #     for row in reader:
+        #         if 'via buzzfeed ' not in row[1].lower():  # only English
+        #             # split title into a list of words and punctuation
+        #             title = self.spaces_and_punctuation.findall(row[2])
+        #             # spell out digits into ordinal words for syllable counting
+        #             title = [string.capwords(
+        #                 self.inflect_engine.number_to_words(int(word)))
+        #                 if word.isdigit() else word for word in title]
+        #             self.sents.append(title)
+        #             self.words.extend(title)
+        #             # all_words only contains words, no punctuation
+        #             self.all_words.extend([word for word in title
+        #                                    if not
+        #                                    self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
         self.cfd = nltk.ConditionalFreqDist(self.bigrams)
         #self.parser = Parser()
@@ -129,7 +139,8 @@ class PoemGenerator():


 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print(haiku)
     #generator.generate_endless_poem(None)
diff --git a/json_to_txt.py b/json_to_txt.py
new file mode 100644
index 0000000..77954bc
--- /dev/null
+++ b/json_to_txt.py
@@ -0,0 +1,20 @@
+# Converts a JSON twitter dump to a raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
diff --git a/syntax_aware_generate.py b/syntax_aware_generate.py
index 1c7545a..7958f0d 100644
--- a/syntax_aware_generate.py
+++ b/syntax_aware_generate.py
@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up
 # the docs.

-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         # sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
         sents = [sent for sent in sents if len(sent) < 150][0:1500]
         for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
         syntaxes = pickle.load(pickle_file)
     if not os.path.exists(CFDS_FILE):
-        # corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False,
                              case_insensitive=True) for i in range(2, 5)]
         with open(CFDS_FILE, 'wb+') as pickle_file:
             pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):


 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])
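
A rough end-to-end usage sketch of the pipeline introduced above, not part of the change itself. Assumptions: tweets.json is a hypothetical filename for a twitter dump shaped like a JSON array of objects with a 'text' field (what json_to_txt.py expects), the NLTK punkt tokenizer data is installed, and trump.txt is the corpus path hardcoded in PoemGenerator.__init__, whose constructor argument is currently ignored.

    # sketch.py -- hypothetical driver, assumes the files above exist
    from json_to_txt import get_text_from_json, write_text_to_file
    from generate_poem import PoemGenerator

    # Flatten the twitter dump into trump.txt, the path PoemGenerator reads.
    write_text_to_file('trump.txt', get_text_from_json('tweets.json'))

    # The constructor argument is unused for now; the corpus path is hardcoded.
    generator = PoemGenerator('trump.txt')
    print(generator.generate_haiku())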