From 35792b6261780fe8d5b1cb08bfaa623db08ce468 Mon Sep 17 00:00:00 2001
From: Tyler Hallada
Date: Fri, 13 Oct 2017 01:04:23 -0400
Subject: [PATCH] Rudimentary impl. of markov chain in a numpy array

---
 markov_matrix.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 markov_matrix.py

diff --git a/markov_matrix.py b/markov_matrix.py
new file mode 100644
index 0000000..57f7ced
--- /dev/null
+++ b/markov_matrix.py
@@ -0,0 +1,85 @@
+"""
+My idea here is to encode the entire corpus as one giant two-dimensional
+numpy array where each row is a condition word and each column in that row
+is every other word in the corpus, holding a count of how often that word
+follows the condition word. Rows are normalized into probabilities at
+generation time.
+"""
+from collections import OrderedDict
+from itertools import islice
+
+import codecs
+import nltk  # TODO: write/import a tokenizer so I don't need to import this
+import numpy as np
+import sys
+
+
+BEGIN_TOKEN = '__BEGIN__'
+END_TOKEN = '__END__'
+
+
+def load_text(filename):
+    """Return all text from a UTF-8 encoded file on disk."""
+    with codecs.open(filename, encoding='utf-8') as corpus:
+        return corpus.read()
+
+
+def build_matrix(text, word_dict, state_size=1):
+    """Count word-to-word transitions in text into a square matrix."""
+    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
+    sentences = nltk.sent_tokenize(text)
+    for sent in sentences:
+        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
+        # Walk every (condition, sample) pair, including the final
+        # transition into END_TOKEN.
+        # NOTE: state_size > 1 needs multi-word conditions in word_dict.
+        for i in range(len(sent) - state_size):
+            condition = ' '.join(sent[i:(i + state_size)])
+            sample = sent[i + state_size]
+            condition_index = word_dict[condition]
+            sample_index = word_dict[sample]
+            matrix[condition_index, sample_index] += 1
+    return matrix
+
+
+def unique_words(text, case_insensitive=False):
+    """Return an OrderedDict mapping each unique word to its matrix index."""
+    word_set = set()
+    # TODO: not great that I'm doing tokenization and looping over them twice...
+    # NOTE: build_matrix doesn't lowercase, so leave case_insensitive off for now.
+    sentences = nltk.sent_tokenize(text)
+    for sent in sentences:
+        sent = nltk.word_tokenize(sent)
+        for word in sent:
+            if case_insensitive:
+                word = word.lower()
+            word_set.add(word)
+    word_set.update([BEGIN_TOKEN, END_TOKEN])
+    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))
+
+
+def generate_sentence(matrix, word_dict):
+    """Random-walk the matrix from BEGIN_TOKEN until END_TOKEN or 30 words."""
+    sent = []
+    counter = 0
+    choices = np.arange(len(word_dict))
+    # Is it bad to create a new array in the inner loop down there?
+    state = word_dict[BEGIN_TOKEN]
+    while state != word_dict[END_TOKEN] and counter < 30:
+        # Normalize this row's counts into a probability distribution.
+        probs = matrix[state].astype(np.float64)
+        probs /= probs.sum()
+        state = np.random.choice(choices, p=probs)
+        if state != word_dict[END_TOKEN]:
+            # word_dict is ordered by index, so the pair at position `state`
+            # holds the word we just sampled.
+            sent.append(next(islice(word_dict.items(), int(state), None))[0])
+        counter += 1
+    return ' '.join(sent)
+
+
+if __name__ == '__main__':
+    text = load_text(sys.argv[1])
+    word_dict = unique_words(text)
+    matrix = build_matrix(text, word_dict)
+    print(generate_sentence(matrix, word_dict))
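
A toy illustration (not part of the patch) of the count-then-normalize step
that generate_sentence performs on each row; the 3-word vocabulary here is
made up for the example:

    import numpy as np

    # Hypothetical vocabulary: 0 = '__BEGIN__', 1 = 'hello', 2 = '__END__'
    counts = np.array([[0, 2, 0],   # '__BEGIN__' -> 'hello' twice
                       [0, 0, 2],   # 'hello' -> '__END__' twice
                       [0, 0, 0]],  # '__END__' is never a condition
                      dtype=np.int32)

    row = counts[0].astype(np.float64)
    row /= row.sum()                              # -> [0., 1., 0.]
    print(np.random.choice(np.arange(3), p=row))  # always samples 1 ('hello')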
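
Usage sketch, assuming NLTK's 'punkt' tokenizer models are available (they
back sent_tokenize/word_tokenize) and 'corpus.txt' stands in for any UTF-8
plain-text file:

    $ pip install nltk numpy
    $ python -c "import nltk; nltk.download('punkt')"
    $ python markov_matrix.py corpus.txt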