Switch to BuzzFeed title corpus. BuzzFeed Haikus!
This commit is contained in:
parent
3142cf0fb3
commit
e7b0d09b46
13
app.py
Normal file
13
app.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from flask import Flask, render_template

from generate_poem import PoemGenerator

# Flask application serving generated BuzzFeed-corpus haikus.
app = Flask(__name__)

# Single generator instance shared across requests; the corpus argument
# is unused by the current PoemGenerator implementation.
generator = PoemGenerator(None)


@app.route("/")
def home():
    """Render the index page with a freshly generated haiku."""
    return render_template('index.html', haiku=generator.generate_haiku())


if __name__ == '__main__':
    app.run(debug=True)
|
24819
buzzfeed_facebook_statuses.csv
Normal file
24819
buzzfeed_facebook_statuses.csv
Normal file
File diff suppressed because it is too large
Load Diff
144
count_syllables.py
Normal file
144
count_syllables.py
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
import re
|
||||||
|
import string
|
||||||
|
from nltk.corpus import cmudict
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level CMU Pronouncing Dictionary: maps a lowercase word to a list
# of pronunciations, each a list of ARPAbet phoneme strings.
cmu = cmudict.dict()
|
||||||
|
|
||||||
|
|
||||||
|
def count_syllables(word):
    """Return the syllable count for *word*.

    Uses the CMU pronouncing dictionary when the word is known, taking the
    maximum count across alternative pronunciations; otherwise falls back
    to the heuristic sylco() counter.
    """
    key = word.lower()
    if key not in cmu:
        return sylco(word)
    # In CMU phoneme lists, vowel phonemes end in a stress digit (0/1/2),
    # so counting digit-terminated phonemes counts syllables.
    return max(sum(phone[-1] in string.digits for phone in pron)
               for pron in cmu[key])
|
||||||
|
|
||||||
|
|
||||||
|
# by M. Emre Aydin from http://eayd.in/?p=232
def sylco(word):
    """Heuristically estimate the number of syllables in an English word.

    Counts vowel groups as a baseline, then applies the numbered
    correction rules of the original sylco algorithm (prefixes, suffixes
    and hand-maintained exception lists).
    """
    word = word.lower()

    vowels = "aeoui"

    # Words the vowel-group heuristic under- or over-counts.
    add_exceptions = ('serious', 'crucial')
    del_exceptions = ('fortunately', 'unfortunately')

    # "co-" prefixes pronounced as one syllable vs. two.
    co_one = ('cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse',
              'coup', 'coif', 'cook', 'coign', 'coiffe', 'coof', 'court')
    co_two = ('coapt', 'coed', 'coinci')

    # "pre-" prefixes pronounced as one syllable.
    pre_one = ('preach',)

    added = 0    # syllables added by the rules below
    dropped = 0  # syllables discarded by the rules below

    # 1) very short words always count as one syllable
    if len(word) <= 3:
        return 1

    # 2) drop a trailing "es"/"ed" unless it forms its own syllable
    #    ("ted", "tes", "ses", "ied", "ies"), and only when the word has
    #    more than one vowel group (so "speed", "fled" keep theirs).
    if word.endswith(('es', 'ed')):
        pair_count = len(re.findall(r'[eaoui][eaoui]', word))
        if pair_count > 1 or len(re.findall(r'[eaoui][^eaoui]', word)) > 1:
            if not word.endswith(('ted', 'tes', 'ses', 'ied', 'ies')):
                dropped += 1

    # 3) drop a trailing silent "e", except for most "-le" endings
    le_exceptions = ('whole', 'mobile', 'pole', 'male', 'female', 'hale',
                     'pale', 'tale', 'sale', 'aisle', 'whale', 'while')
    if word.endswith('e'):
        if not (word.endswith('le') and word not in le_exceptions):
            dropped += 1

    # 4) consecutive vowels (pairs and triples) count as one syllable
    pairs = len(re.findall(r'[eaoui][eaoui]', word))
    triples = len(re.findall(r'[eaoui][eaoui][eaoui]', word))
    dropped += pairs + triples

    # 5) baseline: one syllable per vowel
    vowel_count = len(re.findall(r'[eaoui]', word))

    # 6) "mc" prefix adds a syllable
    if word.startswith('mc'):
        added += 1

    # 7) final "y" after a consonant adds a syllable
    if word.endswith('y') and word[-2] not in vowels:
        added += 1

    # 8) an interior "y" between two consonants adds a syllable
    for i, ch in enumerate(word):
        if ch == 'y' and 0 < i < len(word) - 1:
            if word[i - 1] not in vowels and word[i + 1] not in vowels:
                added += 1

    # 9) "tri-"/"bi-" prefixes followed by a vowel add a syllable
    #    (safe: rule 1 guarantees len(word) >= 4 here)
    if word.startswith('tri') and word[3] in vowels:
        added += 1
    if word.startswith('bi') and word[2] in vowels:
        added += 1

    # 10) "-ian" counts as two syllables, except "-cian"/"-tian"
    if word.endswith('ian') and not word.endswith(('cian', 'tian')):
        added += 1

    # 11) "co-" followed by a vowel: consult the one/two-syllable lists
    if word.startswith('co') and word[2] in vowels:
        prefixes = (word[:4], word[:5], word[:6])
        if any(p in co_two for p in prefixes):
            added += 1
        elif any(p in co_one for p in prefixes):
            pass
        else:
            added += 1

    # 12) "pre-" followed by a vowel adds a syllable unless listed
    if word.startswith('pre') and word[3] in vowels:
        if word[:6] not in pre_one:
            added += 1

    # 13) contracted "-n't" adds a syllable for known words
    negatives = ("doesn't", "isn't", "shouldn't", "couldn't", "wouldn't")
    if word.endswith("n't") and word in negatives:
        added += 1

    # 14) hand-maintained exception lists
    if word in del_exceptions:
        dropped += 1
    if word in add_exceptions:
        added += 1

    # calculate the output
    return vowel_count - dropped + added
|
@ -1,28 +1,65 @@
|
|||||||
import nltk
|
import nltk
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
from textstat.textstat import textstat
|
import string
|
||||||
|
#import pickle
|
||||||
|
import csv
|
||||||
|
import inflect
|
||||||
|
from count_syllables import count_syllables
|
||||||
|
#from get_titles import read_titles
|
||||||
|
#from nltk.corpus import cmudict
|
||||||
#from stat_parser import Parser
|
#from stat_parser import Parser
|
||||||
|
|
||||||
|
|
||||||
class PoemGenerator():
|
class PoemGenerator():
|
||||||
def __init__(self, corpus):
    """Build the BuzzFeed-title language model.

    Reads buzzfeed_facebook_statuses.csv, tokenizes each English title,
    and builds the sentence/word lists and the bigram conditional
    frequency distribution used by the line generators.

    The *corpus* parameter is kept for backward compatibility with
    callers but is currently unused (titles come from the CSV file).
    """
    # Matches tokens made of punctuation only (no word characters).
    self.only_punctuation = re.compile(r'[^\w\s]+$')
    # Tokenizer: runs of word chars (apostrophes allowed) or single punctuation.
    self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
    self.sents = []
    self.words = []
    self.all_words = []
    self.inflect_engine = inflect.engine()
    with open('buzzfeed_facebook_statuses.csv', newline='') as statuses:
        reader = csv.reader(statuses, delimiter=',')
        for row in reader:
            if 'via buzzfeed ' not in row[1].lower():  # only English
                # split title into a list of words and punctuation
                title = self.spaces_and_punctuation.findall(row[2])
                # spell out digits as capitalized cardinal words so they
                # can be syllable-counted (e.g. "21" -> "Twenty-One")
                title = [string.capwords(
                            self.inflect_engine.number_to_words(int(word)))
                         if word.isdigit() else word for word in title]
                self.sents.append(title)
                self.words.extend(title)
                # all_words only contains words, no punctuation
                self.all_words.extend([word for word in title
                                       if not
                                       self.only_punctuation.match(word)])
    self.bigrams = list(nltk.bigrams(self.words))
    self.cfd = nltk.ConditionalFreqDist(self.bigrams)
    self.history = []
|
||||||
|
|
||||||
def markov(self, word, n):
    """Print a random walk of *n* words through the bigram model.

    Starts from *word*, repeatedly samples a successor from the
    conditional frequency distribution, and finishes the line with a
    newline.
    """
    if n > 0:
        # Python 2's `print word,` suppressed the newline; end=' ' is the
        # Python 3 equivalent that keeps the walk on a single line.
        print(word, end=' ')
        # random.choice needs a sequence; dict/FreqDist .items() views
        # are not indexable in Python 3, so materialize them first.
        successor = random.choice(list(self.cfd[word].items()))[0]
        self.markov(successor, n - 1)
    else:
        print('')
|
||||||
|
|
||||||
def generate_poem(self):
|
def generate_poem(self):
|
||||||
#sent = random.choice(self.sents)
|
#sent = random.choice(self.sents)
|
||||||
@ -39,7 +76,7 @@ class PoemGenerator():
|
|||||||
word = random.choice(next_words)
|
word = random.choice(next_words)
|
||||||
new_line = line[:]
|
new_line = line[:]
|
||||||
new_line.append(word)
|
new_line.append(word)
|
||||||
new_syllables = textstat.syllable_count(' '.join(new_line))
|
new_syllables = sum(map(count_syllables, new_line))
|
||||||
if new_syllables == target_syllables:
|
if new_syllables == target_syllables:
|
||||||
return new_line
|
return new_line
|
||||||
elif new_syllables > target_syllables:
|
elif new_syllables > target_syllables:
|
||||||
@ -61,19 +98,34 @@ class PoemGenerator():
|
|||||||
target_syllables)
|
target_syllables)
|
||||||
|
|
||||||
def generate_haiku(self):
    """Return a 5-7-5 haiku as a three-line, newline-terminated string."""
    def successors(last_word):
        # Bigram followers of the previous line's final word, with
        # punctuation-only tokens filtered out.
        return [pair[0] for pair in self.cfd[last_word].items()
                if not self.only_punctuation.match(pair[0])]

    first = self.haiku_line([], 0, self.all_words, 5)
    second = self.haiku_line([], 0, successors(first[-1]), 7)
    third = self.haiku_line([], 0, successors(second[-1]), 5)
    return ''.join(' '.join(line) + '\n' for line in (first, second, third))
|
||||||
|
|
||||||
|
def generate_endless_poem(self, previous_line):
    """Print random lines of 1-25 syllables forever.

    Rewritten iteratively: the original recursed once per printed line
    with no base case, which guarantees a RecursionError after roughly
    a thousand lines. Pass None to start from the full vocabulary.
    """
    line = previous_line
    while True:
        target_syllables = random.choice(range(1, 26))
        if line is None:
            candidates = self.all_words
        else:
            # Seed the next line from bigram successors of the previous
            # line's last word, skipping punctuation-only tokens.
            candidates = [pair[0] for pair in self.cfd[line[-1]].items()
                          if not self.only_punctuation.match(pair[0])]
        line = self.haiku_line([], 0, candidates, target_syllables)
        print(' '.join(line))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    generator = PoemGenerator(nltk.corpus.gutenberg)
    # generate_haiku() returns the haiku string (for the web view) rather
    # than printing it, so print the result here to keep CLI output.
    print(generator.generate_haiku())
|
||||||
|
26
templates/index.html
Normal file
26
templates/index.html
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Declare encoding so titles with non-ASCII characters render correctly -->
    <meta charset="utf-8">
    <title>BuzzFeed Haiku Generator</title>
</head>
<body>
    <h1>BuzzFeed Haiku Generator</h1>
    <div class="haiku">
        <pre>{{haiku}}</pre>
        <!-- Plain GET form: submitting reloads "/" which generates a new haiku -->
        <form action="">
            <input type="submit" value="Generate">
        </form>
    </div>
    <div class="footer">
        <p>
            Made by Tyler Hallada using
            <a href="https://www.python.org/">Python</a>,
            <a href="http://www.nltk.org/">NLTK</a>,
            <a href="http://flask.pocoo.org/">Flask</a>,
            <a href="https://pypi.python.org/pypi/inflect">inflect</a>,
            <a href="http://eayd.in/?p=232">sylco</a>, and
            <a href="https://www.reddit.com/r/datasets/comments/3es1s4/33k_nytimes_and_18k_buzzfeed_facebook_posts_and_a/">BuzzFeed Facebook Posts</a>
        </p>
    </div>
</body>
</html>
|
Loading…
Reference in New Issue
Block a user