Browse Source

Switch to BuzzFeed title corpus. BuzzFeed Haikus!

Tyler Hallada 8 years ago
parent
commit
e7b0d09b46
5 changed files with 25066 additions and 12 deletions
  1. 13 0
      app.py
  2. 24819 0
      buzzfeed_facebook_statuses.csv
  3. 144 0
      count_syllables.py
  4. 64 12
      generate_poem.py
  5. 26 0
      templates/index.html

+ 13 - 0
app.py

@@ -0,0 +1,13 @@
1
from flask import Flask, render_template
from generate_poem import PoemGenerator

app = Flask(__name__)

# Built once at startup: the constructor loads and indexes the BuzzFeed
# status corpus, which is too slow to repeat per request.
generator = PoemGenerator(None)


@app.route("/")
def home():
    """Render the index page with a freshly generated haiku."""
    return render_template('index.html', haiku=generator.generate_haiku())


if __name__ == '__main__':
    # Development server only; debug=True enables the reloader/debugger.
    app.run(debug=True)

File diff suppressed because it is too large
+ 24819 - 0
buzzfeed_facebook_statuses.csv


+ 144 - 0
count_syllables.py

@@ -0,0 +1,144 @@
1
+import re
2
+import string
3
+from nltk.corpus import cmudict
4
+
5
+
6
+cmu = cmudict.dict()
7
+
8
+
9
def count_syllables(word):
    """Return the syllable count of *word*.

    Prefers the CMU pronouncing dictionary; falls back to the sylco()
    heuristic for out-of-vocabulary words.
    """
    key = word.lower()
    if key not in cmu:
        return sylco(word)
    # In CMU pronunciations, vowel phonemes carry a trailing stress digit
    # (0/1/2), so counting digit-terminated phonemes counts syllables.
    # Take the max across alternative pronunciations, as the original did.
    return max(sum(1 for phoneme in pron if phoneme[-1] in string.digits)
               for pron in cmu[key])
16
+ 
17
+
18
# by M. Emre Aydin from http://eayd.in/?p=232
def sylco(word) :
    """Estimate the syllable count of an English *word* heuristically.

    Used as a fallback when the word is missing from the CMU pronouncing
    dictionary. Counts vowel groups, then applies a fixed sequence of
    add/discard rules for common English spelling patterns. The rules are
    order-dependent; do not reorder them. Words of length <= 3 always
    return 1.
    """

    word = word.lower()

    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables

    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    pre_one = ['preach']

    syls = 0 #added syllable number
    disc = 0 #discarded syllable number

    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls

    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)

    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    #3) discard trailing "e", except where ending is "le"

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass

        else :
            disc+=1

    #4) check if consecutive vowels exists, triplets or pairs, count them as one.
    # (re.findall matches are non-overlapping, so "eau" counts one pair plus one triple.)

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple

    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))

    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1

    #7) add one if ends with "y" but is not surrounded by vowel
    # (word[-2] is safe: length > 3 guaranteed by rule 1's early return)
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1

    #8) add one if "y" is surrounded by non-vowels and is not in the last word.

    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1

    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.

    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1

    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1

    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"

    if word[-3:] == "ian" :
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1

    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:2] == "co" and word[2] in 'eaoui' :

        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1

    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1

    #13) check for "-n't" and cross match with dictionary to add syllable.

    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass

    #14) Handling the exceptional words.

    if word in exception_del :
        disc+=1

    if word in exception_add :
        syls+=1

    # calculate the output
    return numVowels - disc + syls

+ 64 - 12
generate_poem.py

@@ -1,28 +1,65 @@
1 1
 import nltk
2 2
 import random
3 3
 import re
4
-from textstat.textstat import textstat
4
+import string
5
+#import pickle
6
+import csv
7
+import inflect
8
+from count_syllables import count_syllables
9
+#from get_titles import read_titles
10
+#from nltk.corpus import cmudict
5 11
 #from stat_parser import Parser
6 12
 
7 13
 
8 14
 class PoemGenerator():
9 15
    def __init__(self, corpus):
        """Build the haiku corpus from BuzzFeed Facebook statuses.

        NOTE(review): the *corpus* parameter is currently unused — titles
        are loaded from buzzfeed_facebook_statuses.csv in the working
        directory instead. Kept for interface compatibility with earlier
        nltk-corpus-based versions (see commented history below).
        """
        #self.corpus = 'melville-moby_dick.txt'
        #self.corpus = read_titles()
        #self.sents = corpus.sents(self.corpus)
        #self.words = corpus.words(self.corpus)
        #self.bigrams = list(nltk.bigrams(self.corpus))
        # Matches tokens that are punctuation only (no word characters).
        self.only_punctuation = re.compile(r'[^\w\s]+$')
        # Tokenizer: words (with apostrophes) or single punctuation marks.
        self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
        #self.all_words = [bigram[0] for bigram in self.bigrams
                          #if not self.only_punctuation.match(bigram[0])]
        #self.cfd = nltk.ConditionalFreqDist(self.bigrams)
        #cfds_file = 'cfds.p'
        #with open(cfds_file, 'rb') as cfds_file:
            #self.cfds = pickle.load(cfds_file)
        #self.cfd = self.cfds[0]
        #self.all_words = list(self.cfd.keys())
        self.sents = []       # list of tokenized titles
        self.words = []       # flat token stream (words + punctuation)
        self.all_words = []   # words only, punctuation filtered out
        self.inflect_engine = inflect.engine()
        # NOTE(review): assumes CSV columns: row[1] = page/source,
        # row[2] = status title — confirm against the dataset's schema.
        with open('buzzfeed_facebook_statuses.csv', newline='') as statuses:
            reader = csv.reader(statuses, delimiter=',')
            for row in reader:
                if 'via buzzfeed ' not in row[1].lower():  # only English
                    # split title into a list of words and punctuation
                    title = self.spaces_and_punctuation.findall(row[2])
                    # spell out digits into ordinal words for syllable counting
                    title = [string.capwords(
                             self.inflect_engine.number_to_words(int(word)))
                             if word.isdigit() else word for word in title]
                    self.sents.append(title)
                    self.words.extend(title)
                    # all_words only contains words, no punctuation
                    self.all_words.extend([word for word in title
                                           if not
                                           self.only_punctuation.match(word)])
        self.bigrams = list(nltk.bigrams(self.words))
        self.cfd = nltk.ConditionalFreqDist(self.bigrams)
        #self.parser = Parser()
        self.history = []
18 55
 
19 56
     def markov(self, word, n):
20 57
         if n > 0:
21
-            print word,
58
+            print(word,)
22 59
             n = n - 1
23 60
             self.markov(random.choice(self.cfd[word].items())[0], n)
24 61
         else:
25
-            print ''
62
+            print('')
26 63
 
27 64
     def generate_poem(self):
28 65
         #sent = random.choice(self.sents)
@@ -39,7 +76,7 @@ class PoemGenerator():
39 76
             word = random.choice(next_words)
40 77
         new_line = line[:]
41 78
         new_line.append(word)
42
-        new_syllables = textstat.syllable_count(' '.join(new_line))
79
+        new_syllables = sum(map(count_syllables, new_line))
43 80
         if new_syllables == target_syllables:
44 81
             return new_line
45 82
         elif new_syllables > target_syllables:
@@ -61,19 +98,34 @@ class PoemGenerator():
61 98
                                        target_syllables)
62 99
 
63 100
     def generate_haiku(self):
101
+        haiku = ''
64 102
         first = self.haiku_line([], 0, self.all_words, 5)
65
-        print ' '.join(first)
103
+        haiku = haiku + ' '.join(first) + '\n'
66 104
         next_words = [freq[0] for freq in self.cfd[first[-1]].items()
67 105
                       if not self.only_punctuation.match(freq[0])]
68 106
         second = self.haiku_line([], 0, next_words, 7)
69
-        print ' '.join(second)
70
-        next_words = [freq[0] for freq in self.cfd[first[-1]].items()
107
+        haiku = haiku + ' '.join(second) + '\n'
108
+        next_words = [freq[0] for freq in self.cfd[second[-1]].items()
71 109
                       if not self.only_punctuation.match(freq[0])]
72 110
         third = self.haiku_line([], 0, next_words, 5)
73
-        print ' '.join(third)
111
+        haiku = haiku + ' '.join(third) + '\n'
112
+        return haiku
113
+
114
+    def generate_endless_poem(self, previous_line):
115
+        random_syllables = random.choice(range(1, 26))
116
+        if previous_line is None:
117
+            next = self.haiku_line([], 0, self.all_words, random_syllables)
118
+            print(' '.join(next))
119
+        else:
120
+            next_words = [freq[0] for freq in self.cfd[previous_line[-1]].items()
121
+                          if not self.only_punctuation.match(freq[0])]
122
+            next = self.haiku_line([], 0, next_words, random_syllables)
123
+            print(' '.join(next))
124
+        self.generate_endless_poem(next)
74 125
 
75 126
 
76 127
 if __name__ == '__main__':
77 128
     generator = PoemGenerator(nltk.corpus.gutenberg)
78 129
     #generator.generate_poem()
79 130
     generator.generate_haiku()
131
+    #generator.generate_endless_poem(None)

+ 26 - 0
templates/index.html

@@ -0,0 +1,26 @@
1
<!DOCTYPE html>
<html lang="en">
  <head>
    <!-- Explicit charset so UTF-8 characters in scraped titles render correctly -->
    <meta charset="utf-8">
    <title>Buzzfeed Haiku Generator</title>
  </head>
  <body>
    <h1>Buzzfeed Haiku Generator</h1>
    <div class="haiku">
      <pre>{{haiku}}</pre>
      <!-- Empty action resubmits (GET) to the current URL, producing a new haiku -->
      <form action="">
        <input type="submit" value="Generate">
      </form>
    </div>
    <div class="footer">
      <p>
        Made by Tyler Hallada using
        <a href="https://www.python.org/">Python</a>,
        <a href="http://www.nltk.org/">NLTK</a>,
        <a href="http://flask.pocoo.org/">Flask</a>,
        <a href="https://pypi.python.org/pypi/inflect">inflect</a>,
        <a href="http://eayd.in/?p=232">sylco</a>, and
        <a href="https://www.reddit.com/r/datasets/comments/3es1s4/33k_nytimes_and_18k_buzzfeed_facebook_posts_and_a/">BuzzFeed Facebook Posts</a>
      </p>
    </div>
  </body>
</html>