Switch to BuzzFeed title corpus. BuzzFeed Haikus!
This commit is contained in:
parent
3142cf0fb3
commit
e7b0d09b46
13
app.py
Normal file
13
app.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from flask import Flask, render_template

from generate_poem import PoemGenerator

# Flask application serving generated BuzzFeed-corpus haikus.
app = Flask(__name__)

# Single generator instance shared across requests; the corpus argument
# is unused by the current PoemGenerator implementation.
generator = PoemGenerator(None)


@app.route("/")
def home():
    """Render the index page with a freshly generated haiku."""
    return render_template('index.html', haiku=generator.generate_haiku())


if __name__ == '__main__':
    app.run(debug=True)
|
24819
buzzfeed_facebook_statuses.csv
Normal file
24819
buzzfeed_facebook_statuses.csv
Normal file
File diff suppressed because it is too large
Load Diff
144
count_syllables.py
Normal file
144
count_syllables.py
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
import re
|
||||||
|
import string
|
||||||
|
from nltk.corpus import cmudict
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level CMU Pronouncing Dictionary: maps a lowercase word to a list
# of pronunciations, each a list of ARPAbet phoneme strings.
cmu = cmudict.dict()
|
||||||
|
|
||||||
|
|
||||||
|
def count_syllables(word):
    """Return the syllable count for *word*.

    Uses the CMU pronouncing dictionary when the word is known, taking the
    maximum count across alternative pronunciations; otherwise falls back
    to the heuristic sylco() counter.
    """
    key = word.lower()
    if key not in cmu:
        return sylco(word)
    # In CMU phoneme lists, vowel phonemes end in a stress digit (0/1/2),
    # so counting digit-terminated phonemes counts syllables.
    return max(sum(phone[-1] in string.digits for phone in pron)
               for pron in cmu[key])
|
||||||
|
|
||||||
|
|
||||||
|
# by M. Emre Aydin from http://eayd.in/?p=232
def sylco(word):
    """Heuristically estimate the number of syllables in an English word.

    Counts vowel groups as a baseline, then applies the numbered
    correction rules of the original sylco algorithm (prefixes, suffixes
    and hand-maintained exception lists).
    """
    word = word.lower()

    vowels = "aeoui"

    # Words the vowel-group heuristic under- or over-counts.
    add_exceptions = ('serious', 'crucial')
    del_exceptions = ('fortunately', 'unfortunately')

    # "co-" prefixes pronounced as one syllable vs. two.
    co_one = ('cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse',
              'coup', 'coif', 'cook', 'coign', 'coiffe', 'coof', 'court')
    co_two = ('coapt', 'coed', 'coinci')

    # "pre-" prefixes pronounced as one syllable.
    pre_one = ('preach',)

    added = 0    # syllables added by the rules below
    dropped = 0  # syllables discarded by the rules below

    # 1) very short words always count as one syllable
    if len(word) <= 3:
        return 1

    # 2) drop a trailing "es"/"ed" unless it forms its own syllable
    #    ("ted", "tes", "ses", "ied", "ies"), and only when the word has
    #    more than one vowel group (so "speed", "fled" keep theirs).
    if word.endswith(('es', 'ed')):
        pair_count = len(re.findall(r'[eaoui][eaoui]', word))
        if pair_count > 1 or len(re.findall(r'[eaoui][^eaoui]', word)) > 1:
            if not word.endswith(('ted', 'tes', 'ses', 'ied', 'ies')):
                dropped += 1

    # 3) drop a trailing silent "e", except for most "-le" endings
    le_exceptions = ('whole', 'mobile', 'pole', 'male', 'female', 'hale',
                     'pale', 'tale', 'sale', 'aisle', 'whale', 'while')
    if word.endswith('e'):
        if not (word.endswith('le') and word not in le_exceptions):
            dropped += 1

    # 4) consecutive vowels (pairs and triples) count as one syllable
    pairs = len(re.findall(r'[eaoui][eaoui]', word))
    triples = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
    dropped += pairs + triples

    # 5) baseline: one syllable per vowel
    vowel_count = len(re.findall(r'[eaoui]', word))

    # 6) "mc" prefix adds a syllable
    if word.startswith('mc'):
        added += 1

    # 7) final "y" after a consonant adds a syllable
    if word.endswith('y') and word[-2] not in vowels:
        added += 1

    # 8) an interior "y" between two consonants adds a syllable
    for i, ch in enumerate(word):
        if ch == 'y' and 0 < i < len(word) - 1:
            if word[i - 1] not in vowels and word[i + 1] not in vowels:
                added += 1

    # 9) "tri-"/"bi-" prefixes followed by a vowel add a syllable
    #    (safe: rule 1 guarantees len(word) >= 4 here)
    if word.startswith('tri') and word[3] in vowels:
        added += 1
    if word.startswith('bi') and word[2] in vowels:
        added += 1

    # 10) "-ian" counts as two syllables, except "-cian"/"-tian"
    if word.endswith('ian') and not word.endswith(('cian', 'tian')):
        added += 1

    # 11) "co-" followed by a vowel: consult the one/two-syllable lists
    if word.startswith('co') and word[2] in vowels:
        prefixes = (word[:4], word[:5], word[:6])
        if any(p in co_two for p in prefixes):
            added += 1
        elif any(p in co_one for p in prefixes):
            pass
        else:
            added += 1

    # 12) "pre-" followed by a vowel adds a syllable unless listed
    if word.startswith('pre') and word[3] in vowels:
        if word[:6] not in pre_one:
            added += 1

    # 13) contracted "-n't" adds a syllable for known words
    negatives = ("doesn't", "isn't", "shouldn't", "couldn't", "wouldn't")
    if word.endswith("n't") and word in negatives:
        added += 1

    # 14) hand-maintained exception lists
    if word in del_exceptions:
        dropped += 1
    if word in add_exceptions:
        added += 1

    # calculate the output
    return vowel_count - dropped + added
|
@ -1,28 +1,65 @@
|
|||||||
import nltk
|
import nltk
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
from textstat.textstat import textstat
|
import string
|
||||||
|
#import pickle
|
||||||
|
import csv
|
||||||
|
import inflect
|
||||||
|
from count_syllables import count_syllables
|
||||||
|
#from get_titles import read_titles
|
||||||
|
#from nltk.corpus import cmudict
|
||||||
#from stat_parser import Parser
|
#from stat_parser import Parser
|
||||||
|
|
||||||
|
|
||||||
class PoemGenerator():
|
class PoemGenerator():
|
||||||
def __init__(self, corpus):
    """Build the BuzzFeed-title language model.

    Reads buzzfeed_facebook_statuses.csv, tokenizes each English title,
    and builds the sentence/word lists and the bigram conditional
    frequency distribution used by the line generators.

    The *corpus* parameter is kept for backward compatibility with
    callers but is currently unused (titles come from the CSV file).
    """
    # Matches tokens made of punctuation only (no word characters).
    self.only_punctuation = re.compile(r'[^\w\s]+$')
    # Tokenizer: runs of word chars (apostrophes allowed) or single punctuation.
    self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
    self.sents = []
    self.words = []
    self.all_words = []
    self.inflect_engine = inflect.engine()
    with open('buzzfeed_facebook_statuses.csv', newline='') as statuses:
        reader = csv.reader(statuses, delimiter=',')
        for row in reader:
            if 'via buzzfeed ' not in row[1].lower():  # only English
                # split title into a list of words and punctuation
                title = self.spaces_and_punctuation.findall(row[2])
                # spell out digits as capitalized cardinal words so they
                # can be syllable-counted (e.g. "21" -> "Twenty-One")
                title = [string.capwords(
                            self.inflect_engine.number_to_words(int(word)))
                         if word.isdigit() else word for word in title]
                self.sents.append(title)
                self.words.extend(title)
                # all_words only contains words, no punctuation
                self.all_words.extend([word for word in title
                                       if not
                                       self.only_punctuation.match(word)])
    self.bigrams = list(nltk.bigrams(self.words))
    self.cfd = nltk.ConditionalFreqDist(self.bigrams)
    self.history = []
|
||||||
|
|
||||||
def markov(self, word, n):
    """Print a random walk of *n* words through the bigram model.

    Starts from *word*, repeatedly samples a successor from the
    conditional frequency distribution, and finishes the line with a
    newline.
    """
    if n > 0:
        # Python 2's `print word,` suppressed the newline; end=' ' is the
        # Python 3 equivalent that keeps the walk on a single line.
        print(word, end=' ')
        # random.choice needs a sequence; dict/FreqDist .items() views
        # are not indexable in Python 3, so materialize them first.
        successor = random.choice(list(self.cfd[word].items()))[0]
        self.markov(successor, n - 1)
    else:
        print('')
|
||||||
|
|
||||||
def generate_poem(self):
|
def generate_poem(self):
|
||||||
#sent = random.choice(self.sents)
|
#sent = random.choice(self.sents)
|
||||||
@ -39,7 +76,7 @@ class PoemGenerator():
|
|||||||
word = random.choice(next_words)
|
word = random.choice(next_words)
|
||||||
new_line = line[:]
|
new_line = line[:]
|
||||||
new_line.append(word)
|
new_line.append(word)
|
||||||
new_syllables = textstat.syllable_count(' '.join(new_line))
|
new_syllables = sum(map(count_syllables, new_line))
|
||||||
if new_syllables == target_syllables:
|
if new_syllables == target_syllables:
|
||||||
return new_line
|
return new_line
|
||||||
elif new_syllables > target_syllables:
|
elif new_syllables > target_syllables:
|
||||||
@ -61,19 +98,34 @@ class PoemGenerator():
|
|||||||
target_syllables)
|
target_syllables)
|
||||||
|
|
||||||
def generate_haiku(self):
    """Return a 5-7-5 haiku as a three-line, newline-terminated string."""
    def successors(last_word):
        # Bigram followers of the previous line's final word, with
        # punctuation-only tokens filtered out.
        return [pair[0] for pair in self.cfd[last_word].items()
                if not self.only_punctuation.match(pair[0])]

    first = self.haiku_line([], 0, self.all_words, 5)
    second = self.haiku_line([], 0, successors(first[-1]), 7)
    third = self.haiku_line([], 0, successors(second[-1]), 5)
    return ''.join(' '.join(line) + '\n' for line in (first, second, third))
|
||||||
|
|
||||||
|
def generate_endless_poem(self, previous_line):
    """Print random lines of 1-25 syllables forever.

    Rewritten iteratively: the original recursed once per printed line
    with no base case, which guarantees a RecursionError after roughly
    a thousand lines. Pass None to start from the full vocabulary.
    """
    line = previous_line
    while True:
        target_syllables = random.choice(range(1, 26))
        if line is None:
            candidates = self.all_words
        else:
            # Seed the next line from bigram successors of the previous
            # line's last word, skipping punctuation-only tokens.
            candidates = [pair[0] for pair in self.cfd[line[-1]].items()
                          if not self.only_punctuation.match(pair[0])]
        line = self.haiku_line([], 0, candidates, target_syllables)
        print(' '.join(line))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    generator = PoemGenerator(nltk.corpus.gutenberg)
    # generate_haiku() returns the haiku string (for the web view) rather
    # than printing it, so print the result here to keep CLI output.
    print(generator.generate_haiku())
|
||||||
|
26
templates/index.html
Normal file
26
templates/index.html
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Declare encoding so titles with non-ASCII characters render correctly -->
    <meta charset="utf-8">
    <title>BuzzFeed Haiku Generator</title>
</head>
<body>
    <h1>BuzzFeed Haiku Generator</h1>
    <div class="haiku">
        <pre>{{haiku}}</pre>
        <!-- Plain GET form: submitting reloads "/" which generates a new haiku -->
        <form action="">
            <input type="submit" value="Generate">
        </form>
    </div>
    <div class="footer">
        <p>
            Made by Tyler Hallada using
            <a href="https://www.python.org/">Python</a>,
            <a href="http://www.nltk.org/">NLTK</a>,
            <a href="http://flask.pocoo.org/">Flask</a>,
            <a href="https://pypi.python.org/pypi/inflect">inflect</a>,
            <a href="http://eayd.in/?p=232">sylco</a>, and
            <a href="https://www.reddit.com/r/datasets/comments/3es1s4/33k_nytimes_and_18k_buzzfeed_facebook_posts_and_a/">BuzzFeed Facebook Posts</a>
        </p>
    </div>
</body>
</html>
|
Loading…
Reference in New Issue
Block a user