|
@@ -1,28 +1,65 @@
|
1
|
1
|
import nltk
|
2
|
2
|
import random
|
3
|
3
|
import re
|
4
|
|
-from textstat.textstat import textstat
|
|
4
|
+import string
|
|
5
|
+#import pickle
|
|
6
|
+import csv
|
|
7
|
+import inflect
|
|
8
|
+from count_syllables import count_syllables
|
|
9
|
+#from get_titles import read_titles
|
|
10
|
+#from nltk.corpus import cmudict
|
5
|
11
|
#from stat_parser import Parser
|
6
|
12
|
|
7
|
13
|
|
8
|
14
|
class PoemGenerator():
|
9
|
15
|
def __init__(self, corpus):
    """Build the bigram model from a CSV of BuzzFeed Facebook statuses.

    Reads ``buzzfeed_facebook_statuses.csv`` from the current working
    directory, tokenizes column 2 of every accepted row, and populates:

    * ``self.sents`` -- one token list per accepted row.
    * ``self.words`` -- every token (words and punctuation) in file
      order, flattened across rows, so bigrams also span the boundary
      between one row and the next.
    * ``self.all_words`` -- the same tokens minus punctuation-only ones.
    * ``self.bigrams`` / ``self.cfd`` -- adjacent token pairs of
      ``self.words`` and their ``nltk.ConditionalFreqDist``.
    * ``self.history`` -- starts out empty.

    Args:
        corpus: Kept for backward compatibility; this constructor does
            not read it.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    # Matches tokens that consist solely of punctuation characters.
    self.only_punctuation = re.compile(r'[^\w\s]+$')
    # Tokenizer: runs of word characters/apostrophes, or a single one of
    # the punctuation marks . , ! ? ;
    self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
    self.sents = []
    self.words = []
    self.all_words = []
    self.inflect_engine = inflect.engine()
    with open('buzzfeed_facebook_statuses.csv', newline='') as statuses:
        for row in csv.reader(statuses, delimiter=','):
            # Blank or truncated rows have no column 1/2 to inspect.
            if len(row) < 3:
                continue
            # Original intent: "only English" -- presumably rows tagged
            # "via BuzzFeed <edition>" are non-English reposts; confirm
            # against the data set.
            if 'via buzzfeed ' in row[1].lower():
                continue
            # Split the title into words and punctuation, spelling out
            # numbers as capitalized words so their syllables can be
            # counted.  isdecimal() (unlike isdigit()) only accepts
            # characters that int() can actually parse.
            title = [
                string.capwords(
                    self.inflect_engine.number_to_words(int(token)))
                if token.isdecimal() else token
                for token in self.spaces_and_punctuation.findall(row[2])
            ]
            self.sents.append(title)
            self.words.extend(title)
            # all_words only contains words, no punctuation.
            self.all_words.extend(
                token for token in title
                if not self.only_punctuation.match(token))
    self.bigrams = list(nltk.bigrams(self.words))
    self.cfd = nltk.ConditionalFreqDist(self.bigrams)
    self.history = []
|
18
|
55
|
|
19
|
56
|
def markov(self, word, n):
    """Print a random walk of up to ``n`` words through the bigram model.

    Starting at ``word``, each step prints the current word followed by a
    space and then moves to a successor picked uniformly from the
    distinct words recorded after it in ``self.cfd``.  The output always
    ends with a newline.

    Args:
        word: The word to start from.
        n: Maximum number of words to print; nothing but the trailing
            newline is printed when ``n`` is not positive.

    The walk stops early when the current word has no recorded successor.
    """
    remaining = n
    while remaining > 0:
        # Keep the words on one line, as the Python 2 ``print word,`` did.
        print(word, end=' ')
        remaining -= 1
        # random.choice() needs a sequence; dict views are not indexable.
        successors = list(self.cfd[word])
        if not successors:
            break
        word = random.choice(successors)
    print('')
|
26
|
63
|
|
27
|
64
|
def generate_poem(self):
|
28
|
65
|
#sent = random.choice(self.sents)
|
|
@@ -39,7 +76,7 @@ class PoemGenerator():
|
39
|
76
|
word = random.choice(next_words)
|
40
|
77
|
new_line = line[:]
|
41
|
78
|
new_line.append(word)
|
42
|
|
- new_syllables = textstat.syllable_count(' '.join(new_line))
|
|
79
|
+ new_syllables = sum(map(count_syllables, new_line))
|
43
|
80
|
if new_syllables == target_syllables:
|
44
|
81
|
return new_line
|
45
|
82
|
elif new_syllables > target_syllables:
|
|
@@ -61,19 +98,34 @@ class PoemGenerator():
|
61
|
98
|
target_syllables)
|
62
|
99
|
|
63
|
100
|
def generate_haiku(self):
    """Compose a three-line poem with 5, 7 and 5 target syllables.

    The first line is drawn from ``self.all_words``.  Each later line is
    drawn from the non-punctuation words that ``self.cfd`` records after
    the last word of the line before it.

    Returns:
        The three lines as one string, each line (including the last)
        terminated by a newline.
    """
    is_punctuation = self.only_punctuation.match
    rendered = []
    previous = None
    for target in (5, 7, 5):
        if previous is None:
            pool = self.all_words
        else:
            # Candidate openers: successors of the previous line's final
            # word, with punctuation-only tokens filtered out.
            pool = [follower for follower in self.cfd[previous[-1]]
                    if not is_punctuation(follower)]
        previous = self.haiku_line([], 0, pool, target)
        rendered.append(' '.join(previous))
    return '\n'.join(rendered) + '\n'
|
|
113
|
+
|
|
114
|
def generate_endless_poem(self, previous_line):
    """Print randomly sized poem lines forever, each chained to the last.

    Every line gets a random target of 1 to 25 syllables.  Its candidate
    words are the non-punctuation successors (per ``self.cfd``) of the
    previous line's final word.  When there is no previous line, or that
    word has no usable successor, the line is drawn from
    ``self.all_words`` instead.

    Args:
        previous_line: List of words to continue from, or ``None`` (or an
            empty list) to start a fresh poem.

    This method never returns; it only stops when an exception (for
    example ``KeyboardInterrupt``) propagates out of it.  It loops rather
    than recursing so that a long-running poem cannot exhaust Python's
    recursion limit.
    """
    while True:
        random_syllables = random.choice(range(1, 26))
        candidates = []
        if previous_line:
            candidates = [follower
                          for follower in self.cfd[previous_line[-1]]
                          if not self.only_punctuation.match(follower)]
        if not candidates:
            # Nothing to chain from: restart from the whole vocabulary
            # rather than handing haiku_line an empty candidate list.
            candidates = self.all_words
        previous_line = self.haiku_line([], 0, candidates, random_syllables)
        print(' '.join(previous_line))
|
74
|
125
|
|
75
|
126
|
|
76
|
127
|
if __name__ == '__main__':
    generator = PoemGenerator(nltk.corpus.gutenberg)
    #generator.generate_poem()
    # generate_haiku() returns the poem instead of printing it, and the
    # returned text already ends with a newline, so print it as-is.
    print(generator.generate_haiku(), end='')
    #generator.generate_endless_poem(None)
|