# nlp/generate_poem.py

import codecs
import nltk
import random
import re
import string
import csv
import inflect
from count_syllables import count_syllables


class PoemGenerator(object):
    """Generate haiku and other short lines from a bigram model of a corpus.

    Attributes populated by the constructor:
        sents: sentences (``.txt`` corpora) or token lists (BuzzFeed CSV).
        words: every token of the corpus, punctuation included.
        all_words: ``words`` without the punctuation-only tokens.
        bigrams: list of adjacent token pairs built from ``words``.
        cfd: ``nltk.ConditionalFreqDist`` mapping a token to its successors.
        history: initialised to an empty list; not used inside this class.
    """

    def __init__(self, corpus='buzzfeed_facebook_statuses.csv'):
        """Read ``corpus`` and build the bigram model from it.

        The default file name used to be misspelled ('..._statues.csv'),
        which sent the default straight to the unimplemented generic CSV
        reader; it now names the BuzzFeed corpus that ``read_corpus``
        recognises.
        """
        # Matches tokens made up solely of non-word, non-space characters.
        self.only_punctuation = re.compile(r'[^\w\s]+$')
        # Splits a title into words (apostrophes kept) and punctuation marks.
        self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
        self.sents = []
        self.words = []
        self.all_words = []
        self.inflect_engine = inflect.engine()
        self.read_corpus(corpus)
        self.bigrams = list(nltk.bigrams(self.words))
        self.cfd = nltk.ConditionalFreqDist(self.bigrams)
        self.history = []

    def read_corpus(self, corpus):
        """Given filename of corpus, populate words, all_words, and sents.

        Raises:
            TypeError: if the file name ends in neither '.csv' nor '.txt'.
            NotImplementedError: for '.csv' files other than the BuzzFeed
                corpus.
        """
        if corpus.endswith('.csv'):
            if 'buzzfeed_facebook_statuses' in corpus:
                return self.read_buzzfeed_corpus(corpus)
            return self.read_csv_corpus(corpus)
        if corpus.endswith('.txt'):
            return self.read_txt_corpus(corpus)
        raise TypeError(('Unrecognized corpus file type: %s. ' % corpus) +
                        '".txt" and ".csv" are only supported')

    def read_txt_corpus(self, corpus):
        """Tokenize a UTF-8 text file into sentences and words with nltk."""
        with codecs.open(corpus, 'r', 'utf-8') as corpus_content:
            text = corpus_content.read()
        # The file is no longer needed once its text is in memory.
        sents = nltk.tokenize.sent_tokenize(text)
        words = nltk.tokenize.word_tokenize(text)
        self.sents.extend(sents)
        self.words.extend(words)
        self.all_words.extend(self._without_punctuation(words))

    def read_csv_corpus(self, corpus):
        """Placeholder for generic CSV corpora; always raises."""
        raise NotImplementedError('Haven\'t implemented generic csv reading')

    def read_buzzfeed_corpus(self, corpus):
        """Read the BuzzFeed CSV: column 1 is filtered on, column 2 is text.

        Rows with fewer than three columns are skipped instead of raising
        ``IndexError``.
        """
        with open(corpus, newline='', encoding='utf-8') as statuses:
            reader = csv.reader(statuses, delimiter=',')
            for row in reader:
                if len(row) < 3:
                    continue
                # The original author's note: this filter keeps only
                # English statuses.
                if 'via buzzfeed ' in row[1].lower():
                    continue
                # split title into a list of words and punctuation
                title = self.spaces_and_punctuation.findall(row[2])
                # spell out digits as words for syllable counting
                title = [self._spell_out(token) for token in title]
                self.sents.append(title)
                self.words.extend(title)
                # all_words only contains words, no punctuation
                self.all_words.extend(self._without_punctuation(title))

    def _spell_out(self, token):
        """Return a numeric token spelled out in words, others unchanged."""
        # isdecimal() rather than isdigit(): the latter also accepts
        # characters such as superscript digits, which int() rejects.
        if token.isdecimal():
            return string.capwords(
                self.inflect_engine.number_to_words(int(token)))
        return token

    def _without_punctuation(self, tokens):
        """Return the tokens that are not made up solely of punctuation."""
        return [token for token in tokens
                if not self.only_punctuation.match(token)]

    def _successors(self, word):
        """Return the non-punctuation tokens seen directly after ``word``."""
        return [successor for successor, _ in self.cfd[word].items()
                if not self.only_punctuation.match(successor)]

    @staticmethod
    def _in_random_order(words):
        """Yield every item of ``words`` once, in uniformly random order.

        The first draw costs O(1); the list is only copied if a second
        item is requested, and each further draw is O(1) (swap-and-pop).
        """
        if not words:
            return
        index = random.randrange(len(words))
        yield words[index]
        pool = list(words)
        pool[index] = pool[-1]
        pool.pop()
        while pool:
            index = random.randrange(len(pool))
            pool[index], pool[-1] = pool[-1], pool[index]
            yield pool.pop()

    def markov(self, word, n):
        """Print a random chain of up to ``n`` words starting at ``word``.

        Words are printed on one line separated by spaces, followed by a
        newline. The chain stops early at a word with no recorded
        successor.
        """
        for remaining in range(n, 0, -1):
            print(word, end=' ')
            if remaining == 1:
                break
            # A dict view is not a sequence, so materialise the successors
            # before handing them to random.choice.
            successors = [successor for successor, _
                          in self.cfd[word].items()]
            if not successors:
                break
            word = random.choice(successors)
        print('')

    def generate_text(self):
        """Print a 15-word Markov chain seeded with a random bigram head."""
        word = random.choice(self.bigrams)[0]
        self.markov(word, 15)

    def haiku_line(self, line, current_syllables, next_words,
                   target_syllables):
        """Extend ``line`` to exactly ``target_syllables`` by backtracking.

        Args:
            line: list of words chosen so far (not modified).
            current_syllables: kept for interface compatibility; the count
                is recomputed from ``line`` as the original code did.
            next_words: candidate words for the next position; duplicates
                make a word proportionally more likely to be tried first.
            target_syllables: exact number of syllables the line must have.

        Returns:
            A new list of words, or None when no candidate leads to a line
            of the requested length.
        """
        if not next_words:
            # this branch failed
            return None
        base_syllables = sum(map(count_syllables, line))
        # A word that failed once always fails for the same prefix, so its
        # duplicates are skipped rather than searched again.
        rejected = set()
        # Sibling candidates are tried in a loop; recursion is only used to
        # go one word deeper.
        # NOTE(review): depth stays within target_syllables only if
        # count_syllables never returns 0 -- confirm.
        for word in self._in_random_order(next_words):
            if word in rejected:
                continue
            new_syllables = base_syllables + count_syllables(word)
            if new_syllables == target_syllables:
                return line + [word]
            if new_syllables < target_syllables:
                branch = self.haiku_line(line + [word], new_syllables,
                                         self._successors(word),
                                         target_syllables)
                if branch:
                    return branch
            rejected.add(word)
        return None

    def _next_line(self, previous_line, target_syllables):
        """Build a line that follows ``previous_line`` where possible.

        Successors of the previous line's last word are tried first; if
        there is no previous line, or no line can be built from them, the
        whole vocabulary is used instead.

        Raises:
            ValueError: if the corpus cannot produce such a line at all.
        """
        line = None
        if previous_line:
            line = self.haiku_line([], 0,
                                   self._successors(previous_line[-1]),
                                   target_syllables)
        if line is None:
            line = self.haiku_line([], 0, self.all_words, target_syllables)
        if line is None:
            raise ValueError('Corpus cannot produce a line of %d syllables'
                             % target_syllables)
        return line

    def generate_haiku(self):
        """Return a 5-7-5 haiku as three newline-terminated lines.

        Raises:
            ValueError: if a line of the required length cannot be built.
        """
        lines = []
        for target_syllables in (5, 7, 5):
            previous_line = lines[-1] if lines else None
            lines.append(self._next_line(previous_line, target_syllables))
        return ''.join(' '.join(line) + '\n' for line in lines)

    def generate_endless_poem(self, previous_line):
        """Print lines of random length (1-25 syllables) without end.

        Pass None as ``previous_line`` to start from the whole vocabulary.
        This is a loop rather than self-recursion, so it no longer runs
        into the interpreter's recursion limit.

        Raises:
            ValueError: if a line of the drawn length cannot be built.
        """
        while True:
            random_syllables = random.choice(range(1, 26))
            previous_line = self._next_line(previous_line, random_syllables)
            print(' '.join(previous_line))


if __name__ == '__main__':
    # Script entry point: build a generator over the BuzzFeed status corpus
    # and print a single haiku.
    buzzfeed_poet = PoemGenerator(corpus='buzzfeed_facebook_statuses.csv')
    print(buzzfeed_poet.generate_haiku())
Allow generating poems from raw text The filename of the raw text is supplied via the first command line argument to the python script call. 2017-03-14 05:03:23 +00:00			`import codecs`
Track existing code 2015-06-07 20:27:59 +00:00			`import nltk`
			`import random`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`import re`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`import string`
			`import csv`
			`import inflect`
			`from count_syllables import count_syllables`
Track existing code 2015-06-07 20:27:59 +00:00

Clean up generate_poem.py a little 2017-07-11 02:10:02 +00:00			`class PoemGenerator(object):`
			`def __init__(self, corpus='buzzfeed_facebook_statues.csv'):`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`self.only_punctuation = re.compile(r'[^\w\s]+$')`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`self.spaces_and_punctuation = re.compile(r"[\w']+\|[.,!?;]")`
			`self.sents = []`
			`self.words = []`
			`self.all_words = []`
			`self.inflect_engine = inflect.engine()`
Clean up generate_poem.py a little 2017-07-11 02:10:02 +00:00			`self.read_corpus(corpus)`
			`self.bigrams = list(nltk.bigrams(self.words))`
			`self.cfd = nltk.ConditionalFreqDist(self.bigrams)`
			`self.history = []`

			`def read_corpus(self, corpus):`
			`"""Given filename of corpus, populate words, all_words, and sents."""`
			`if corpus.endswith('.csv'):`
			`if 'buzzfeed_facebook_statuses' in corpus:`
			`return self.read_buzzfeed_corpus(corpus)`
			`else:`
			`return self.read_csv_corpus(corpus)`
			`elif corpus.endswith('.txt'):`
			`return self.read_txt_corpus(corpus)`
			`else:`
			`raise TypeError(('Unrecognized corpus file type: %s.' % corpus) +`
			`'".txt" and ".csv" are only supported')`

			`def read_txt_corpus(self, corpus):`
			`with codecs.open(corpus, 'r', 'utf-8') as corpus_content:`
			`text = corpus_content.read()`
			`sents = nltk.tokenize.sent_tokenize(text)`
			`words = nltk.tokenize.word_tokenize(text)`
			`self.sents.extend(sents)`
			`self.words.extend(words)`
			`self.all_words.extend([word for word in words`
			`if not`
			`self.only_punctuation.match(word)])`

			`def read_csv_corpus(self, corpus):`
			`raise NotImplementedError('Haven\'t implemented generic csv reading')`

			`def read_buzzfeed_corpus(self, corpus):`
			`with open(corpus, newline='', encoding='utf-8') as statuses:`
Getting app to run on new server under gunicorn 2017-07-10 19:47:10 +00:00			`reader = csv.reader(statuses, delimiter=',')`
			`for row in reader:`
			`if 'via buzzfeed ' not in row[1].lower(): # only English`
			`# split title into a list of words and punctuation`
			`title = self.spaces_and_punctuation.findall(row[2])`
			`# spell out digits into ordinal words for syllable counting`
			`title = [string.capwords(`
			`self.inflect_engine.number_to_words(int(word)))`
			`if word.isdigit() else word for word in title]`
			`self.sents.append(title)`
			`self.words.extend(title)`
			`# all_words only contains words, no punctuation`
			`self.all_words.extend([word for word in title`
			`if not`
			`self.only_punctuation.match(word)])`
Track existing code 2015-06-07 20:27:59 +00:00
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`def markov(self, word, n):`
			`if n > 0:`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`print(word,)`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`n = n - 1`
			`self.markov(random.choice(self.cfd[word].items())[0], n)`
			`else:`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`print('')`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00
Add jupyter notebook presentation 2017-04-10 19:49:24 +00:00			`def generate_text(self):`
Track existing code 2015-06-07 20:27:59 +00:00			`word = random.choice(self.bigrams)[0]`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`self.markov(word, 15)`

			`def haiku_line(self, line, current_syllables, next_words,`
			`target_syllables):`
			`if next_words == []:`
			`# this branch failed`
			`return None`
			`else:`
			`word = random.choice(next_words)`
			`new_line = line[:]`
			`new_line.append(word)`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`new_syllables = sum(map(count_syllables, new_line))`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`if new_syllables == target_syllables:`
			`return new_line`
			`elif new_syllables > target_syllables:`
			`new_next_words = next_words[:]`
			`new_next_words.remove(word)`
			`return self.haiku_line(line, current_syllables, new_next_words,`
			`target_syllables)`
			`else:`
			`new_next_words = [freq[0] for freq in self.cfd[word].items()`
			`if not self.only_punctuation.match(freq[0])]`
			`branch = self.haiku_line(new_line, new_syllables, new_next_words,`
			`target_syllables)`
			`if branch:`
			`return branch`
			`else:`
			`new_next_words = next_words[:]`
			`new_next_words.remove(word)`
			`return self.haiku_line(line, current_syllables, new_next_words,`
			`target_syllables)`

			`def generate_haiku(self):`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`haiku = ''`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`first = self.haiku_line([], 0, self.all_words, 5)`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`haiku = haiku + ' '.join(first) + '\n'`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`next_words = [freq[0] for freq in self.cfd[first[-1]].items()`
			`if not self.only_punctuation.match(freq[0])]`
Try and fix occasional exception 2016-08-14 21:43:03 +00:00			`if not next_words:`
			`next_words = self.all_words`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`second = self.haiku_line([], 0, next_words, 7)`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`haiku = haiku + ' '.join(second) + '\n'`
			`next_words = [freq[0] for freq in self.cfd[second[-1]].items()`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`if not self.only_punctuation.match(freq[0])]`
Try and fix occasional exception 2016-08-14 21:43:03 +00:00			`if not next_words:`
			`next_words = self.all_words`
progress, haiku, and shakespeare 2015-07-14 04:03:05 +00:00			`third = self.haiku_line([], 0, next_words, 5)`
Switch to BuzzFeed title corpus. BuzzFeed Haikus! 2015-08-02 22:01:16 +00:00			`haiku = haiku + ' '.join(third) + '\n'`
			`return haiku`

			`def generate_endless_poem(self, previous_line):`
			`random_syllables = random.choice(range(1, 26))`
			`if previous_line is None:`
			`next = self.haiku_line([], 0, self.all_words, random_syllables)`
			`print(' '.join(next))`
			`else:`
			`next_words = [freq[0] for freq in self.cfd[previous_line[-1]].items()`
			`if not self.only_punctuation.match(freq[0])]`
			`next = self.haiku_line([], 0, next_words, random_syllables)`
			`print(' '.join(next))`
			`self.generate_endless_poem(next)`
Track existing code 2015-06-07 20:27:59 +00:00

			`if __name__ == '__main__':`
Clean up generate_poem.py a little 2017-07-11 02:10:02 +00:00			`generator = PoemGenerator(corpus='buzzfeed_facebook_statuses.csv')`
Allow generating poems from raw text The filename of the raw text is supplied via the first command line argument to the python script call. 2017-03-14 05:03:23 +00:00			`haiku = generator.generate_haiku()`
Getting app to run on new server under gunicorn 2017-07-10 19:47:10 +00:00			`print(haiku)`