Various scripts for playing around with natural language processing/generation

markov_matrix.py 3.4KB

  1. """
  2. My idea here is to encode the entire corpus as one giant two-dimensional numpy array of floats where each row is a
  3. condition word and each column in that row is every other word in the corpus and the probability that the word follows
  4. the conditional word.
  5. This was an interesting idea, but ultimately not that useful since the resulting numpy array is significantly larger
  6. than just storing the CFD in a python dictionary. There might be some crazy linear algebra I could run to compress this
  7. array to make it less sparse. But, I would need to use the same N words for all corpora and I think that the resulting
  8. compressed arrays would only be really useful for comparing with each other to find things like "closeness" between two
  9. corpora as defined by the probabilities that some words follow other words in the text. Also, using the same N words
  10. across all corpora is less awesome because you will miss out on the unique words (names, proper nouns, etc.) present in
  11. only some corpora.
  12. """
import codecs
import sys
from collections import OrderedDict
from itertools import islice

import nltk  # TODO: write/import a tokenizer so I don't need to import this
import numpy as np

BEGIN_TOKEN = '__BEGIN__'
END_TOKEN = '__END__'


def load_text(filename):
    """Return all text from a UTF-8 encoded file on disk."""
    with codecs.open(filename, encoding='utf-8') as corpus:
        return corpus.read()


def build_matrix(text, word_dict, state_size=1):
    """Count word transitions: matrix[i][j] is how often word j follows condition word i.

    Note: state_size > 1 would need multi-word conditions present in word_dict, so only
    state_size=1 really works with unique_words() below.
    """
    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
        # Stop at len(sent) - state_size so the final transition into END_TOKEN is counted.
        for i in range(len(sent) - state_size):
            condition = ' '.join(sent[i:(i + state_size)])
            sample = sent[i + state_size]
            condition_index = word_dict[condition]
            sample_index = word_dict[sample]
            matrix[condition_index][sample_index] += 1
    return matrix


def unique_words(text, case_insensitive=False):
    """Return an OrderedDict mapping every unique word in the given text to its index."""
    word_set = set()
    # TODO: not great that I'm tokenizing here and looping over the sentences again in build_matrix...
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        for word in nltk.word_tokenize(sent):
            if case_insensitive:
                word = word.lower()
            word_set.add(word)
    word_set.update({BEGIN_TOKEN, END_TOKEN})
    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))


def generate_sentence(matrix, word_dict):
    """Random-walk the matrix from BEGIN_TOKEN until END_TOKEN is sampled or 30 words are emitted."""
    sent = []
    counter = 0
    choices = np.arange(len(word_dict))
    # Is it bad to create a new array in the inner loop down there?
    # probs = np.zeros((len(word_dict),), dtype=np.float64)
    state = word_dict[BEGIN_TOKEN]
    while state != word_dict[END_TOKEN] and counter != 30:
        # Normalize this word's row of counts into a probability distribution.
        probs = matrix[state].astype(np.float64)
        probs /= probs.sum()
        state = np.random.choice(choices, p=probs)
        if state != word_dict[END_TOKEN]:
            # word_dict is ordered by index, so the item at position `state` is the sampled word.
            sent.append(next(islice(word_dict.items(), int(state), None))[0])
        counter += 1
    return ' '.join(sent)


if __name__ == '__main__':
    text = load_text(sys.argv[1])
    word_dict = unique_words(text)
    matrix = build_matrix(text, word_dict)
    print(generate_sentence(matrix, word_dict))
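
For comparison with the matrix approach, here is a minimal sketch of the dictionary-backed CFD the docstring mentions, built with collections.defaultdict and Counter; only observed (condition, sample) pairs take up memory, which is why it stays much smaller than the dense vocab-by-vocab array. The build_cfd name is made up for illustration, and the BEGIN/END constants mirror the ones in the script above.

from collections import Counter, defaultdict

import nltk

BEGIN_TOKEN = '__BEGIN__'
END_TOKEN = '__END__'


def build_cfd(text):
    """Sparse, dict-based equivalent of build_matrix: cfd[condition][sample] = count."""
    cfd = defaultdict(Counter)
    for sent in nltk.sent_tokenize(text):
        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
        # Only (condition, sample) pairs that actually occur ever get stored.
        for condition, sample in zip(sent, sent[1:]):
            cfd[condition][sample] += 1
    return cfd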
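
And a rough sketch of the compression/closeness idea floated in the docstring: reduce each corpus's count matrix to a low-rank embedding with truncated SVD (scipy's svds) and compare the reduced rows with cosine similarity. This is only one way it might work, and it assumes both matrices were built over the same shared word_dict; compress_matrix, corpus_similarity, and the rank of 50 are all invented for illustration, not part of the script.

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


def compress_matrix(matrix, rank=50):
    """Reduce a (vocab x vocab) count matrix to a (vocab x rank) embedding via truncated SVD."""
    u, s, _ = svds(csr_matrix(matrix.astype(np.float64)), k=rank)
    return u * s  # each row is a rank-dimensional vector for one condition word


def corpus_similarity(emb_a, emb_b):
    """Mean cosine similarity between corresponding rows of two embeddings."""
    norm_a = np.linalg.norm(emb_a, axis=1, keepdims=True)
    norm_b = np.linalg.norm(emb_b, axis=1, keepdims=True)
    norm_a[norm_a == 0] = 1.0  # rows for words that never occur as conditions
    norm_b[norm_b == 0] = 1.0
    return float(np.mean(np.sum((emb_a / norm_a) * (emb_b / norm_b), axis=1)))


# e.g. corpus_similarity(compress_matrix(matrix_a), compress_matrix(matrix_b)),
# which only makes sense if matrix_a and matrix_b share the same word_dict.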