Browse Source

Rudimentary impl. of markov chain in a numpy array

Tyler Hallada 6 years ago
parent
commit
35792b6261
1 changed files with 76 additions and 0 deletions
  1. 76 0
      markov_matrix.py

+ 76 - 0
markov_matrix.py

@@ -0,0 +1,76 @@
1
"""
My idea here is to encode the entire corpus as one giant two-dimensional numpy array of floats where each row is a
condition word and each column in that row is every other word in the corpus and the probability that the word follows
the condition word.
"""
6
+from collections import OrderedDict
7
+from itertools import islice
8
+
9
+import codecs
10
+import nltk  # TODO: write/import a tokenizer so I don't need to import this
11
+import numpy as np
12
+import sys
13
+
14
+
15
# Sentinel tokens wrapped around every sentence so the chain learns where
# sentences begin and end; both get their own row/column in the matrix.
BEGIN_TOKEN = '__BEGIN__'
END_TOKEN = '__END__'
17
+
18
+
19
def load_text(filename):
    """Read a UTF-8 encoded file from disk and return its entire contents."""
    with codecs.open(filename, encoding='utf-8') as handle:
        return handle.read()
23
+
24
+
25
def build_matrix(text, word_dict, state_size=1):
    """Build a square transition-count matrix over the corpus vocabulary.

    Args:
        text: Raw corpus text; tokenized here into sentences and words.
        word_dict: Mapping of word -> matrix index (as built by
            unique_words); must contain BEGIN_TOKEN, END_TOKEN and every
            corpus word.
        state_size: Number of consecutive words forming the condition state.
            NOTE(review): for state_size > 1 the condition is a space-joined
            n-gram, which word_dict (built from single words) will not
            contain — only state_size=1 is currently usable.

    Returns:
        A (len(word_dict), len(word_dict)) int32 array where entry [i][j]
        counts how often word j followed condition i in the corpus.
    """
    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
    for sent in nltk.sent_tokenize(text):
        tokens = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
        # range(len(tokens) - state_size) lets i reach the final transition
        # into END_TOKEN. The previous "- (state_size + 1)" off-by-one meant
        # END_TOKEN was never recorded as a sample, so generation could never
        # terminate naturally (the "it's not finding the end token" TODO).
        for i in range(len(tokens) - state_size):
            condition = ' '.join(tokens[i:(i + state_size)])
            sample = tokens[i + state_size]
            matrix[word_dict[condition]][word_dict[sample]] += 1
    return matrix
37
+
38
+
39
def unique_words(tokenized_text, case_insensitive=False):
    """Return an OrderedDict mapping each unique word to a stable index.

    Args:
        tokenized_text: Raw corpus text (despite the name it is tokenized
            here, not by the caller).
        case_insensitive: If True, lowercase every word before deduplicating.

    Returns:
        OrderedDict of word -> index, with words in sorted order so indices
        are deterministic; BEGIN_TOKEN and END_TOKEN are always included.
    """
    word_set = set()
    # TODO: not great that tokenization happens both here and in
    # build_matrix — the corpus is tokenized twice.
    # Bug fix: this previously read the module-level global `text` instead of
    # the parameter, so it only worked when called from __main__.
    for sent in nltk.sent_tokenize(tokenized_text):
        for word in nltk.word_tokenize(sent):
            if case_insensitive:
                word = word.lower()
            word_set.add(word)
    word_set.update({BEGIN_TOKEN, END_TOKEN})
    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))
52
+
53
+
54
def generate_sentence(matrix, word_dict,
                      begin_token='__BEGIN__', end_token='__END__'):
    """Sample one sentence by walking the transition-count matrix.

    Args:
        matrix: Square 2-D array where matrix[i][j] counts how often word j
            followed word i (as built by build_matrix).
        word_dict: OrderedDict of word -> index whose insertion order matches
            the index values (as built by unique_words).
        begin_token: Word the walk starts from (defaults to BEGIN_TOKEN).
        end_token: Word that terminates the walk (defaults to END_TOKEN).

    Returns:
        The generated sentence as a space-joined string, capped at 30 words.
    """
    # Invert the dict once so each sampled index is an O(1) list lookup
    # instead of an O(n) islice scan per generated word.
    index_to_word = list(word_dict)
    choices = np.arange(len(word_dict))
    end_index = word_dict[end_token]
    state = word_dict[begin_token]
    sent = []
    for _ in range(30):  # hard cap so a bad matrix can't loop forever
        # np.float was removed in NumPy 1.24; plain float is equivalent.
        row = matrix[state].astype(float)
        total = row.sum()
        if total == 0:
            break  # dead-end state with no observed successors
        state = int(np.random.choice(choices, p=row / total))
        if state == end_index:
            break
        sent.append(index_to_word[state])
    return ' '.join(sent)
70
+
71
+
72
if __name__ == '__main__':
    # Usage: python markov_matrix.py <corpus.txt>
    text = load_text(sys.argv[1])
    # Map every unique word (plus the sentinel tokens) to a matrix index.
    word_dict = unique_words(text)
    # Count word-to-word transitions across the whole corpus.
    matrix = build_matrix(text, word_dict)
    print(generate_sentence(matrix, word_dict))