@@ -2,14 +2,22 @@
 My idea here is to encode the entire corpus as one giant two-dimensional numpy array of floats where each row is a
 condition word and each column in that row is every other word in the corpus and the probability that the word follows
 the conditional word.
+
+This was an interesting idea, but ultimately not that useful since the resulting numpy array is significantly larger
+than just storing the CFD in a python dictionary. There might be some crazy linear algebra I could run to compress this
+array to make it less sparse. But, I would need to use the same N words for all corpora and I think that the resulting
+compressed arrays would only be really useful for comparing with each other to find things like "closeness" between two
+corpora as defined by the probabilities that some words follow other words in the text. Also, using the same N words
+across all corpora is less awesome because you will miss out on the unique words (names, proper nouns, etc.) present in
+only some corpora.
 """
+import codecs
+import sys
 from collections import OrderedDict
 from itertools import islice
 
-import codecs
 import nltk # TODO: write/import a tokenizer so I don't need to import this
 import numpy as np
-import sys
 
 
 BEGIN_TOKEN = '__BEGIN__'
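
As a sketch of the encoding the docstring describes (rows are condition words, columns are following words, cells hold follow probabilities), something like the snippet below would build that matrix with numpy. The function name and the naive whitespace tokenization are illustrative assumptions, not part of this change:

```python
import numpy as np

def transition_matrix(tokens):
    """Dense follow-probability matrix: probs[i, j] = P(vocab[j] follows vocab[i]).

    Illustrative only -- on a real corpus this array is far larger (and far
    sparser) than the equivalent CFD stored as a Python dictionary.
    """
    vocab = sorted(set(tokens))
    index = {word: i for i, word in enumerate(vocab)}
    counts = np.zeros((len(vocab), len(vocab)), dtype=float)
    # Count each bigram: how often `following` comes right after `current`.
    for current, following in zip(tokens, tokens[1:]):
        counts[index[current], index[following]] += 1
    # Normalize each row to probabilities; rows for words that never
    # appear as a condition stay all-zero.
    row_sums = counts.sum(axis=1, keepdims=True)
    probs = np.divide(counts, row_sums,
                      out=np.zeros_like(counts), where=row_sums > 0)
    return vocab, probs

# Example: the row for 'the' is the distribution over words that follow it.
vocab, probs = transition_matrix("the cat sat on the mat".split())
print(dict(zip(vocab, probs[vocab.index('the')])))
```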