@@ -0,0 +1,91 @@
+"""
+My idea here is to encode the entire corpus as one giant two-dimensional numpy array where each row is a
+condition word, each column is every other word in the corpus, and each cell counts how often that column's
+word follows the condition word; the counts become probabilities when each row is normalized at sampling time.
+"""
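+# e.g. with the corpus "The cat sat. The dog sat." the row for 'The' gets a
+# count of 1 in the 'cat' and 'dog' columns and 0 everywhere else, so each
+# follows 'The' with probability 0.5 once the row is normalized.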
+import codecs
+import sys
+from collections import OrderedDict
+
+import nltk  # TODO: write/import a tokenizer so I don't need to import this
+import numpy as np
+
+
+BEGIN_TOKEN = '__BEGIN__'
+END_TOKEN = '__END__'
+
+
+def load_text(filename):
+    """Return all text from a UTF-8 encoded file on disk."""
+    with codecs.open(filename, encoding='utf-8') as corpus:
+        return corpus.read()
+
+
+def build_matrix(text, word_dict, state_size=1):
+    """Return a matrix of transition counts: rows are conditions, columns samples."""
+    matrix = np.zeros((len(word_dict),) * 2, dtype=np.int32)
+    sentences = nltk.sent_tokenize(text)
+    for sent in sentences:
+        sent = [BEGIN_TOKEN] + nltk.word_tokenize(sent) + [END_TOKEN]
+        # Let i run up to len(sent) - state_size - 1 so END_TOKEN is the final sample.
+        for i in range(len(sent) - state_size):
+            condition = ' '.join(sent[i:(i + state_size)])
+            sample = sent[i + state_size]
+            # NOTE: word_dict only holds single words, so only state_size=1 works for now.
+            condition_index = word_dict[condition]
+            sample_index = word_dict[sample]
+            matrix[condition_index, sample_index] += 1
+    return matrix
+
+
+def unique_words(text, case_insensitive=False):
+    """Return an OrderedDict mapping each unique word in the text to its matrix index."""
+    # NOTE: build_matrix looks tokens up with their original case, so passing
+    # case_insensitive=True would cause KeyErrors there as written.
+    word_set = set()
+    # TODO: not great that I'm doing tokenization and looping over them twice...
+    sentences = nltk.sent_tokenize(text)
+    for sent in sentences:
+        sent = nltk.word_tokenize(sent)
+        for word in sent:
+            if case_insensitive:
+                word = word.lower()
+            word_set.add(word)
+    word_set.update({BEGIN_TOKEN, END_TOKEN})
+    return OrderedDict((word, i) for i, word in enumerate(sorted(word_set)))
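+# e.g. unique_words('A cat.') returns
+# OrderedDict([('.', 0), ('A', 1), ('__BEGIN__', 2), ('__END__', 3), ('cat', 4)])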
+
+
+def generate_sentence(matrix, word_dict):
+    """Sample one sentence from the transition matrix, starting at BEGIN_TOKEN."""
+    sent = []
+    counter = 0
+    choices = np.arange(len(word_dict))
+    index_to_word = list(word_dict)  # position i holds the word whose index is i
+    state = word_dict[BEGIN_TOKEN]
+    # Cap the sentence at 30 words in case END_TOKEN is never sampled.
+    while state != word_dict[END_TOKEN] and counter < 30:
+        # astype copies the row into a fresh float array each iteration (cheap
+        # at <= 30 iterations), which we then normalize into probabilities.
+        probs = matrix[state].astype(float)
+        probs /= probs.sum()
+        state = np.random.choice(choices, p=probs)
+        if state != word_dict[END_TOKEN]:
+            sent.append(index_to_word[state])
+        counter += 1
+    return ' '.join(sent)
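+# Sampling sketch: with the toy corpus from the top-of-file comment, __BEGIN__
+# always transitions to 'The', and the normalized row for 'The' then gives
+# np.random.choice a 0.5/0.5 split between the 'cat' and 'dog' columns.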
+
+
+if __name__ == '__main__':
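+    # Usage (script/corpus names are placeholders): python markov.py corpus.txt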
+    text = load_text(sys.argv[1])
+    word_dict = unique_words(text)
+    matrix = build_matrix(text, word_dict)
+    print(generate_sentence(matrix, word_dict))