Rudimentary impl. of a Markov chain in a numpy array
parent d922297f99
commit 35792b6261
76	markov_matrix.py	Normal file
@@ -0,0 +1,76 @@
"""
My idea here is to encode the entire corpus as one giant two-dimensional numpy array of floats,
where each row is a condition word, each column is another word in the corpus, and each cell
holds the probability that the column's word follows the condition word.
"""
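# Illustrative example (hypothetical toy corpus, not produced by this code): for the corpus
# "the cat sat", rows and columns are indexed by the sorted vocabulary plus the
# __BEGIN__/__END__ markers, and matrix[word_dict['the'], word_dict['cat']] holds how often
# (and, once normalised, how likely it is that) 'cat' follows 'the'.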
from collections import OrderedDict
from itertools import islice

import codecs
import nltk  # TODO: write/import a tokenizer so I don't need to import this
import numpy as np
import sys


BEGIN_TOKEN = '__BEGIN__'
END_TOKEN = '__END__'


def load_text(filename):
    """Return all text from UTF-8 encoded file on disk."""
    with codecs.open(filename, encoding='utf-8') as corpus:
        return corpus.read()


def build_matrix(text, word_dict, state_size=1):
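    """Build a (vocab x vocab) matrix of transition counts: entry [i, j] counts how often
    the word with index j follows the condition word with index i. The counts are only
    normalised into probabilities later, in generate_sentence()."""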
    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
        # Walk every (condition, sample) pair, including the final transition into
        # END_TOKEN so that generation can actually reach the end of a sentence.
        for i in range(len(sent) - state_size):
            condition = ' '.join(sent[i:(i + state_size)])
            sample = sent[(i + state_size)]
            condition_index = word_dict[condition]
            sample_index = word_dict[sample]
            matrix[condition_index, sample_index] += 1
    return matrix


def unique_words(text, case_insensitive=False):
    """Return an OrderedDict mapping each unique word in the given text to its index."""
    word_set = set()
    # TODO: not great that I'm doing tokenization and looping over the sentences twice...
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = nltk.word_tokenize(sent)
        for word in sent:
            if case_insensitive:
                word = word.lower()
            word_set.add(word)
    word_set.update({BEGIN_TOKEN, END_TOKEN})
    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))


def generate_sentence(matrix, word_dict):
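    """Generate one sentence by repeatedly sampling the next word from the transition
    matrix, starting at BEGIN_TOKEN and stopping at END_TOKEN or after 30 samples."""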
    sent = []
    counter = 0
    choices = np.arange(len(word_dict))
    # Is it bad to create a new array in the inner loop down there?
    # probs = np.zeros((len(word_dict),), dtype=float)
    state = word_dict[BEGIN_TOKEN]
    while state != word_dict[END_TOKEN] and counter != 30:
        # Normalise this row's transition counts into a probability distribution.
        probs = matrix[state].astype(float)
        probs /= probs.sum()
        state = np.random.choice(choices, p=probs)
        if state != word_dict[END_TOKEN]:
            # word_dict keeps its (sorted) insertion order, so the state-th item of
            # items() is the word whose index equals `state`.
            sent.append(next(islice(word_dict.items(), int(state), None))[0])
        counter += 1
    return ' '.join(sent)


if __name__ == '__main__':
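    # Usage: python markov_matrix.py <path to a UTF-8 encoded text file>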
    text = load_text(sys.argv[1])
    word_dict = unique_words(text)
    matrix = build_matrix(text, word_dict)
    print(generate_sentence(matrix, word_dict))