Various scripts for playing around with natural language processing/generation

generate_poem.py (5.9 KB)
  1. import codecs
  2. import nltk
  3. import random
  4. import re
  5. import string
  6. import csv
  7. import inflect
  8. from count_syllables import count_syllables
  9. class PoemGenerator(object):
  10. def __init__(self, corpus='buzzfeed_facebook_statues.csv'):
  11. self.only_punctuation = re.compile(r'[^\w\s]+$')
  12. self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
  13. self.sents = []
  14. self.words = []
  15. self.all_words = []
  16. self.inflect_engine = inflect.engine()
  17. self.read_corpus(corpus)
  18. self.bigrams = list(nltk.bigrams(self.words))
  19. self.cfd = nltk.ConditionalFreqDist(self.bigrams)
  20. self.history = []
  21. def read_corpus(self, corpus):
  22. """Given filename of corpus, populate words, all_words, and sents."""
  23. if corpus.endswith('.csv'):
  24. if 'buzzfeed_facebook_statuses' in corpus:
  25. return self.read_buzzfeed_corpus(corpus)
  26. else:
  27. return self.read_csv_corpus(corpus)
  28. elif corpus.endswith('.txt'):
  29. return self.read_txt_corpus(corpus)
  30. else:
  31. raise TypeError(('Unrecognized corpus file type: %s.' % corpus) +
  32. '".txt" and ".csv" are only supported')
  33. def read_txt_corpus(self, corpus):
  34. with codecs.open(corpus, 'r', 'utf-8') as corpus_content:
  35. text = corpus_content.read()
  36. sents = nltk.tokenize.sent_tokenize(text)
  37. words = nltk.tokenize.word_tokenize(text)
  38. self.sents.extend(sents)
  39. self.words.extend(words)
  40. self.all_words.extend([word for word in words
  41. if not
  42. self.only_punctuation.match(word)])
  43. def read_csv_corpus(self, corpus):
  44. raise NotImplementedError('Haven\'t implemented generic csv reading')
  45. def read_buzzfeed_corpus(self, corpus):
  46. with open(corpus, newline='', encoding='utf-8') as statuses:
  47. reader = csv.reader(statuses, delimiter=',')
  48. for row in reader:
  49. if 'via buzzfeed ' not in row[1].lower(): # only English
  50. # split title into a list of words and punctuation
  51. title = self.spaces_and_punctuation.findall(row[2])
  52. # spell out digits into ordinal words for syllable counting
  53. title = [string.capwords(
  54. self.inflect_engine.number_to_words(int(word)))
  55. if word.isdigit() else word for word in title]
  56. self.sents.append(title)
  57. self.words.extend(title)
  58. # all_words only contains words, no punctuation
  59. self.all_words.extend([word for word in title
  60. if not
  61. self.only_punctuation.match(word)])
  62. def markov(self, word, n):
  63. if n > 0:
  64. print(word,)
  65. n = n - 1
  66. self.markov(random.choice(self.cfd[word].items())[0], n)
  67. else:
  68. print('')
  69. def generate_text(self):
  70. word = random.choice(self.bigrams)[0]
  71. self.markov(word, 15)
  72. def haiku_line(self, line, current_syllables, next_words,
  73. target_syllables):
  74. if next_words == []:
  75. # this branch failed
  76. return None
  77. else:
  78. word = random.choice(next_words)
  79. new_line = line[:]
  80. new_line.append(word)
  81. new_syllables = sum(map(count_syllables, new_line))
  82. if new_syllables == target_syllables:
  83. return new_line
  84. elif new_syllables > target_syllables:
  85. new_next_words = next_words[:]
  86. new_next_words.remove(word)
  87. return self.haiku_line(line, current_syllables, new_next_words,
  88. target_syllables)
  89. else:
  90. new_next_words = [freq[0] for freq in self.cfd[word].items()
  91. if not self.only_punctuation.match(freq[0])]
  92. branch = self.haiku_line(new_line, new_syllables, new_next_words,
  93. target_syllables)
  94. if branch:
  95. return branch
  96. else:
  97. new_next_words = next_words[:]
  98. new_next_words.remove(word)
  99. return self.haiku_line(line, current_syllables, new_next_words,
  100. target_syllables)
  101. def generate_haiku(self):
  102. haiku = ''
  103. first = self.haiku_line([], 0, self.all_words, 5)
  104. haiku = haiku + ' '.join(first) + '\n'
  105. next_words = [freq[0] for freq in self.cfd[first[-1]].items()
  106. if not self.only_punctuation.match(freq[0])]
  107. if not next_words:
  108. next_words = self.all_words
  109. second = self.haiku_line([], 0, next_words, 7)
  110. haiku = haiku + ' '.join(second) + '\n'
  111. next_words = [freq[0] for freq in self.cfd[second[-1]].items()
  112. if not self.only_punctuation.match(freq[0])]
  113. if not next_words:
  114. next_words = self.all_words
  115. third = self.haiku_line([], 0, next_words, 5)
  116. haiku = haiku + ' '.join(third) + '\n'
  117. return haiku
  118. def generate_endless_poem(self, previous_line):
  119. random_syllables = random.choice(range(1, 26))
  120. if previous_line is None:
  121. next = self.haiku_line([], 0, self.all_words, random_syllables)
  122. print(' '.join(next))
  123. else:
  124. next_words = [freq[0] for freq in self.cfd[previous_line[-1]].items()
  125. if not self.only_punctuation.match(freq[0])]
  126. next = self.haiku_line([], 0, next_words, random_syllables)
  127. print(' '.join(next))
  128. self.generate_endless_poem(next)
  129. if __name__ == '__main__':
  130. generator = PoemGenerator(corpus='buzzfeed_facebook_statuses.csv')
  131. haiku = generator.generate_haiku()
  132. print(haiku)