Rudimentary impl. of markov chain in a numpy array

parent d922297f99
commit 35792b6261

markov_matrix.py (new file, 76 lines)
@@ -0,0 +1,76 @@
"""
My idea here is to encode the entire corpus as one giant two-dimensional
numpy array where each row is a condition word and each column is every
other word in the corpus. Each cell holds the number of times the column's
word follows the condition word; the counts in a row are normalized into
probabilities when a sentence is generated.
"""
from collections import OrderedDict

import codecs
import nltk  # TODO: write/import a tokenizer so I don't need to import this
import numpy as np
import sys


BEGIN_TOKEN = '__BEGIN__'
END_TOKEN = '__END__'


def load_text(filename):
    """Return all text from a UTF-8 encoded file on disk."""
    with codecs.open(filename, encoding='utf-8') as corpus:
        return corpus.read()


def build_matrix(text, word_dict, state_size=1):
    """Count word-to-word transitions for every sentence in the text."""
    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
        # Run i up to len(sent) - state_size so the final transition into
        # END_TOKEN is recorded; otherwise generation can never reach the
        # end token.
        for i in range(len(sent) - state_size):
            condition = ' '.join(sent[i:(i + state_size)])
            sample = sent[i + state_size]
            # NOTE: word_dict only holds single words, so these lookups
            # fail for state_size > 1.
            condition_index = word_dict[condition]
            sample_index = word_dict[sample]
            matrix[condition_index, sample_index] += 1
    return matrix
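
# Worked example (hypothetical sentence): "the cat sat." tokenizes to
# [__BEGIN__, 'the', 'cat', 'sat', '.', __END__], so build_matrix records
# the transitions __BEGIN__->the, the->cat, cat->sat, sat->., and .->__END__.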


def unique_words(text, case_insensitive=False):
    """Return an OrderedDict mapping each unique word in the text to its index."""
    word_set = set()
    # TODO: not great that I'm doing tokenization and looping over them twice...
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        sent = nltk.word_tokenize(sent)
        for word in sent:
            if case_insensitive:
                word = word.lower()
            word_set.add(word)
    word_set.update({BEGIN_TOKEN, END_TOKEN})
    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))
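
# Sketch of the resulting mapping for a hypothetical corpus "a b.": the
# sorted vocabulary is ['.', '__BEGIN__', '__END__', 'a', 'b'], so the
# dict maps '.' -> 0, '__BEGIN__' -> 1, '__END__' -> 2, 'a' -> 3, 'b' -> 4.
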
def generate_sentence(matrix, word_dict):
    """Random-walk the chain from BEGIN_TOKEN until END_TOKEN or a 30-word cap."""
    sent = []
    counter = 0
    choices = np.arange(len(word_dict))
    # Index -> word lookup, so we don't have to scan the OrderedDict on
    # every step; iteration order matches the indices assigned in
    # unique_words().
    words = list(word_dict)
    state = word_dict[BEGIN_TOKEN]
    while state != word_dict[END_TOKEN] and counter < 30:
        # Normalize the row of raw counts into a probability distribution.
        # astype() allocates a small new array each iteration, which is
        # fine at this scale.
        probs = matrix[state].astype(float)
        probs /= probs.sum()
        state = np.random.choice(choices, p=probs)
        if state != word_dict[END_TOKEN]:
            sent.append(words[int(state)])
        counter += 1
    return ' '.join(sent)
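
# Sampling sketch (hypothetical row of counts): if matrix[state] is
# [0, 2, 1, 1], probs becomes [0.0, 0.5, 0.25, 0.25], so np.random.choice
# picks index 1 about half the time and never picks index 0.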


if __name__ == '__main__':
    text = load_text(sys.argv[1])
    word_dict = unique_words(text)
    matrix = build_matrix(text, word_dict)
    print(generate_sentence(matrix, word_dict))
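
# Example usage (assumes a plain-text UTF-8 corpus saved as corpus.txt, a
# hypothetical filename, and that NLTK's 'punkt' tokenizer models have been
# fetched via nltk.download('punkt')):
#
#   python markov_matrix.py corpus.txt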