Various scripts for playing around with natural language processing/generation

generate_poem.py (5.4 KB)
  1. import nltk
  2. import random
  3. import re
  4. import string
  5. #import pickle
  6. import csv
  7. import inflect
  8. from count_syllables import count_syllables
  9. #from get_titles import read_titles
  10. #from nltk.corpus import cmudict
  11. #from stat_parser import Parser
  12. class PoemGenerator():
  13. def __init__(self, corpus):
  14. #self.corpus = 'melville-moby_dick.txt'
  15. #self.corpus = read_titles()
  16. #self.sents = corpus.sents(self.corpus)
  17. #self.words = corpus.words(self.corpus)
  18. #self.bigrams = list(nltk.bigrams(self.corpus))
  19. self.only_punctuation = re.compile(r'[^\w\s]+$')
  20. self.spaces_and_punctuation = re.compile(r"[\w']+|[.,!?;]")
  21. #self.all_words = [bigram[0] for bigram in self.bigrams
  22. #if not self.only_punctuation.match(bigram[0])]
  23. #self.cfd = nltk.ConditionalFreqDist(self.bigrams)
  24. #cfds_file = 'cfds.p'
  25. #with open(cfds_file, 'rb') as cfds_file:
  26. #self.cfds = pickle.load(cfds_file)
  27. #self.cfd = self.cfds[0]
  28. #self.all_words = list(self.cfd.keys())
  29. self.sents = []
  30. self.words = []
  31. self.all_words = []
  32. self.inflect_engine = inflect.engine()
  33. with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
  34. reader = csv.reader(statuses, delimiter=',')
  35. for row in reader:
  36. if 'via buzzfeed ' not in row[1].lower(): # only English
  37. # split title into a list of words and punctuation
  38. title = self.spaces_and_punctuation.findall(row[2])
  39. # spell out digits into ordinal words for syllable counting
  40. title = [string.capwords(
  41. self.inflect_engine.number_to_words(int(word)))
  42. if word.isdigit() else word for word in title]
  43. self.sents.append(title)
  44. self.words.extend(title)
  45. # all_words only contains words, no punctuation
  46. self.all_words.extend([word for word in title
  47. if not
  48. self.only_punctuation.match(word)])
  49. self.bigrams = list(nltk.bigrams(self.words))
  50. self.cfd = nltk.ConditionalFreqDist(self.bigrams)
  51. #self.parser = Parser()
  52. self.history = []
  53. def markov(self, word, n):
  54. if n > 0:
  55. print(word,)
  56. n = n - 1
  57. self.markov(random.choice(self.cfd[word].items())[0], n)
  58. else:
  59. print('')
  60. def generate_poem(self):
  61. #sent = random.choice(self.sents)
  62. #parsed = self.parser.parse(' '.join(sent))
  63. word = random.choice(self.bigrams)[0]
  64. self.markov(word, 15)
  65. def haiku_line(self, line, current_syllables, next_words,
  66. target_syllables):
  67. if next_words == []:
  68. # this branch failed
  69. return None
  70. else:
  71. word = random.choice(next_words)
  72. new_line = line[:]
  73. new_line.append(word)
  74. new_syllables = sum(map(count_syllables, new_line))
  75. if new_syllables == target_syllables:
  76. return new_line
  77. elif new_syllables > target_syllables:
  78. new_next_words = next_words[:]
  79. new_next_words.remove(word)
  80. return self.haiku_line(line, current_syllables, new_next_words,
  81. target_syllables)
  82. else:
  83. new_next_words = [freq[0] for freq in self.cfd[word].items()
  84. if not self.only_punctuation.match(freq[0])]
  85. branch = self.haiku_line(new_line, new_syllables, new_next_words,
  86. target_syllables)
  87. if branch:
  88. return branch
  89. else:
  90. new_next_words = next_words[:]
  91. new_next_words.remove(word)
  92. return self.haiku_line(line, current_syllables, new_next_words,
  93. target_syllables)
  94. def generate_haiku(self):
  95. haiku = ''
  96. first = self.haiku_line([], 0, self.all_words, 5)
  97. haiku = haiku + ' '.join(first) + '\n'
  98. next_words = [freq[0] for freq in self.cfd[first[-1]].items()
  99. if not self.only_punctuation.match(freq[0])]
  100. second = self.haiku_line([], 0, next_words, 7)
  101. haiku = haiku + ' '.join(second) + '\n'
  102. next_words = [freq[0] for freq in self.cfd[second[-1]].items()
  103. if not self.only_punctuation.match(freq[0])]
  104. third = self.haiku_line([], 0, next_words, 5)
  105. haiku = haiku + ' '.join(third) + '\n'
  106. return haiku
  107. def generate_endless_poem(self, previous_line):
  108. random_syllables = random.choice(range(1, 26))
  109. if previous_line is None:
  110. next = self.haiku_line([], 0, self.all_words, random_syllables)
  111. print(' '.join(next))
  112. else:
  113. next_words = [freq[0] for freq in self.cfd[previous_line[-1]].items()
  114. if not self.only_punctuation.match(freq[0])]
  115. next = self.haiku_line([], 0, next_words, random_syllables)
  116. print(' '.join(next))
  117. self.generate_endless_poem(next)
  118. if __name__ == '__main__':
  119. generator = PoemGenerator(nltk.corpus.gutenberg)
  120. #generator.generate_poem()
  121. generator.generate_haiku()
  122. #generator.generate_endless_poem(None)