|
@@ -1,3 +1,4 @@
|
|
1
|
+import codecs
|
1
|
2
|
import nltk
|
2
|
3
|
import random
|
3
|
4
|
import re
|
|
@@ -32,22 +33,31 @@ class PoemGenerator():
|
32
|
33
|
self.words = []
|
33
|
34
|
self.all_words = []
|
34
|
35
|
self.inflect_engine = inflect.engine()
|
35
|
|
- with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
|
36
|
|
- reader = csv.reader(statuses, delimiter=',')
|
37
|
|
- for row in reader:
|
38
|
|
- if 'via buzzfeed ' not in row[1].lower(): # only English
|
39
|
|
- # split title into a list of words and punctuation
|
40
|
|
- title = self.spaces_and_punctuation.findall(row[2])
|
41
|
|
- # spell out digits into ordinal words for syllable counting
|
42
|
|
- title = [string.capwords(
|
43
|
|
- self.inflect_engine.number_to_words(int(word)))
|
44
|
|
- if word.isdigit() else word for word in title]
|
45
|
|
- self.sents.append(title)
|
46
|
|
- self.words.extend(title)
|
47
|
|
- # all_words only contains words, no punctuation
|
48
|
|
- self.all_words.extend([word for word in title
|
49
|
|
- if not
|
50
|
|
- self.only_punctuation.match(word)])
|
|
36
|
+ # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
|
|
37
|
+ # reader = csv.reader(statuses, delimiter=',')
|
|
38
|
+ # for row in reader:
|
|
39
|
+ # if 'via buzzfeed ' not in row[1].lower(): # only English
|
|
40
|
+ # # split title into a list of words and punctuation
|
|
41
|
+ # title = self.spaces_and_punctuation.findall(row[2])
|
|
42
|
+ # # spell out digits into ordinal words for syllable counting
|
|
43
|
+ # title = [string.capwords(
|
|
44
|
+ # self.inflect_engine.number_to_words(int(word)))
|
|
45
|
+ # if word.isdigit() else word for word in title]
|
|
46
|
+ # self.sents.append(title)
|
|
47
|
+ # self.words.extend(title)
|
|
48
|
+ # # all_words only contains words, no punctuation
|
|
49
|
+ # self.all_words.extend([word for word in title
|
|
50
|
+ # if not
|
|
51
|
+ # self.only_punctuation.match(word)])
|
|
52
|
+ with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
|
|
53
|
+ text = corpus.read()
|
|
54
|
+ sents = nltk.tokenize.sent_tokenize(text)
|
|
55
|
+ words = nltk.tokenize.word_tokenize(text)
|
|
56
|
+ self.sents.extend(sents)
|
|
57
|
+ self.words.extend(words)
|
|
58
|
+ self.all_words.extend([word for word in words
|
|
59
|
+ if not
|
|
60
|
+ self.only_punctuation.match(word)])
|
51
|
61
|
self.bigrams = list(nltk.bigrams(self.words))
|
52
|
62
|
self.cfd = nltk.ConditionalFreqDist(self.bigrams)
|
53
|
63
|
#self.parser = Parser()
|
|
@@ -129,7 +139,8 @@ class PoemGenerator():
|
129
|
139
|
|
130
|
140
|
|
131
|
141
|
if __name__ == '__main__':
|
132
|
|
- generator = PoemGenerator(nltk.corpus.gutenberg)
|
|
142
|
+ generator = PoemGenerator('poop')
|
133
|
143
|
#generator.generate_poem()
|
134
|
|
- generator.generate_haiku()
|
|
144
|
+ haiku = generator.generate_haiku()
|
|
145
|
+ print haiku
|
135
|
146
|
#generator.generate_endless_poem(None)
|