Various scripts for playing around with natural language processing/generation

generate_random_from_file.py 845B

1234567891011121314151617181920212223242526272829303132
  1. import nltk
  2. import random
  3. import string
  4. import sys
  5. def main(text):
  6. bigrams = list(nltk.bigrams(
  7. [token for token in nltk.word_tokenize(text.decode('utf8'))
  8. if set(token).difference(set(string.punctuation))]))
  9. cfdist = nltk.ConditionalFreqDist(bigrams)
  10. word = random.choice(bigrams)[0]
  11. for i in range(155):
  12. print word,
  13. if i % 3:
  14. top_words = tuple(cfdist[word])
  15. else:
  16. dist = cfdist[word].copy()
  17. top_words = []
  18. for i in range(3):
  19. if dist:
  20. top_words.append(dist.max())
  21. del dist[top_words[-1]]
  22. else:
  23. break
  24. word = random.choice(top_words)
  25. if __name__ == '__main__':
  26. file = sys.argv[1]
  27. with open(file, 'r') as f:
  28. main(f.read())