|
@@ -33,31 +33,31 @@ class PoemGenerator():
|
33
|
33
|
self.words = []
|
34
|
34
|
self.all_words = []
|
35
|
35
|
self.inflect_engine = inflect.engine()
|
36
|
|
- # with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
|
37
|
|
- # reader = csv.reader(statuses, delimiter=',')
|
38
|
|
- # for row in reader:
|
39
|
|
- # if 'via buzzfeed ' not in row[1].lower(): # only English
|
40
|
|
- # # split title into a list of words and punctuation
|
41
|
|
- # title = self.spaces_and_punctuation.findall(row[2])
|
42
|
|
- # # spell out digits into ordinal words for syllable counting
|
43
|
|
- # title = [string.capwords(
|
44
|
|
- # self.inflect_engine.number_to_words(int(word)))
|
45
|
|
- # if word.isdigit() else word for word in title]
|
46
|
|
- # self.sents.append(title)
|
47
|
|
- # self.words.extend(title)
|
48
|
|
- # # all_words only contains words, no punctuation
|
49
|
|
- # self.all_words.extend([word for word in title
|
50
|
|
- # if not
|
51
|
|
- # self.only_punctuation.match(word)])
|
52
|
|
- with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
|
53
|
|
- text = corpus.read()
|
54
|
|
- sents = nltk.tokenize.sent_tokenize(text)
|
55
|
|
- words = nltk.tokenize.word_tokenize(text)
|
56
|
|
- self.sents.extend(sents)
|
57
|
|
- self.words.extend(words)
|
58
|
|
- self.all_words.extend([word for word in words
|
59
|
|
- if not
|
60
|
|
- self.only_punctuation.match(word)])
|
|
36
|
+ with open('buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
|
|
37
|
+ reader = csv.reader(statuses, delimiter=',')
|
|
38
|
+ for row in reader:
|
|
39
|
+ if 'via buzzfeed ' not in row[1].lower(): # only English
|
|
40
|
+ # split title into a list of words and punctuation
|
|
41
|
+ title = self.spaces_and_punctuation.findall(row[2])
|
|
42
|
+ # spell out digit tokens as cardinal number words (e.g. "42" -> "Forty-two") for syllable counting
|
|
43
|
+ title = [string.capwords(
|
|
44
|
+ self.inflect_engine.number_to_words(int(word)))
|
|
45
|
+ if word.isdigit() else word for word in title]
|
|
46
|
+ self.sents.append(title)
|
|
47
|
+ self.words.extend(title)
|
|
48
|
+ # all_words only contains words, no punctuation
|
|
49
|
+ self.all_words.extend([word for word in title
|
|
50
|
+ if not
|
|
51
|
+ self.only_punctuation.match(word)])
|
|
52
|
+ # with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
|
|
53
|
+ # text = corpus.read()
|
|
54
|
+ # sents = nltk.tokenize.sent_tokenize(text)
|
|
55
|
+ # words = nltk.tokenize.word_tokenize(text)
|
|
56
|
+ # self.sents.extend(sents)
|
|
57
|
+ # self.words.extend(words)
|
|
58
|
+ # self.all_words.extend([word for word in words
|
|
59
|
+ # if not
|
|
60
|
+ # self.only_punctuation.match(word)])
|
61
|
61
|
self.bigrams = list(nltk.bigrams(self.words))
|
62
|
62
|
self.cfd = nltk.ConditionalFreqDist(self.bigrams)
|
63
|
63
|
#self.parser = Parser()
|
|
@@ -142,5 +142,5 @@ if __name__ == '__main__':
|
142
|
142
|
generator = PoemGenerator()
|
143
|
143
|
#generator.generate_poem()
|
144
|
144
|
haiku = generator.generate_haiku()
|
145
|
|
- print haiku
|
|
145
|
+ print(haiku)
|
146
|
146
|
#generator.generate_endless_poem(None)
|