|
@@ -0,0 +1,107 @@
|
|
1
|
+import codecs
|
|
2
|
+import os
|
|
3
|
+import pickle
|
|
4
|
+import random
|
|
5
|
+
|
|
6
|
+import spacy
|
|
7
|
+
|
|
8
|
# Directory names of the two corpora: sentence structure comes from the
# template corpus, vocabulary from the content corpus.
TEMPLATE_CORPUS = 'austencorpus'
CONTENT_CORPUS = 'lovecraftcorpus'

# Load the large English model eagerly at import time (slow).  It provides the
# parser plus the word vectors that Token.similarity() relies on later.
print('Loading spaCy model... ', end='')
nlp = spacy.load('en_core_web_lg')
print('Done')
|
|
16
|
def load_text_files(dirname):
    """Yield the full UTF-8 text of every file under *dirname*, recursively.

    Files are streamed one at a time so large corpora are never held in
    memory all at once.
    """
    for dirpath, _dirnames, filenames in os.walk(dirname):
        for filename in filenames:
            # Built-in open() with encoding= supersedes the legacy
            # codecs.open() for reading text (per the codecs module docs).
            with open(os.path.join(dirpath, filename), encoding='utf-8') as f:
                yield f.read()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
def load_syntax(dirname):
    """Concatenate every text file under *dirname* and parse it with spaCy.

    Returns the resulting spaCy ``Doc`` for the whole corpus.
    """
    # ''.join avoids the quadratic cost of repeated string += in a loop.
    full_text = ''.join(load_text_files(dirname))
    return nlp(full_text)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
def load_object_to_file(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE(review): despite the "to_file" name this function READS a file;
    the name is kept for compatibility with existing callers.  Only use
    on trusted files -- pickle can execute arbitrary code.
    """
    with open(filename, 'rb') as handle:
        payload = handle.read()
    return pickle.loads(payload)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
def save_object_to_file(filename, obj):
    """Pickle *obj* into *filename*, overwriting any existing file.

    The second parameter was renamed from ``object`` (which shadowed the
    builtin) to ``obj``; all visible callers pass it positionally, so this
    is backward-compatible -- confirm no external keyword callers exist.
    """
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+def build_content_dict(content_syntax):
|
|
42
|
+ content_dict = {}
|
|
43
|
+ for word in content_syntax:
|
|
44
|
+ if word.tag not in content_dict:
|
|
45
|
+ content_dict[word.tag] = {}
|
|
46
|
+ if word.dep not in content_dict[word.tag]:
|
|
47
|
+ content_dict[word.tag][word.dep] = set()
|
|
48
|
+ content_dict[word.tag][word.dep].add(word)
|
|
49
|
+ return content_dict
|
|
50
|
+
|
|
51
|
+
|
|
52
|
def find_closest_content_word(template_word, content_dict):
    """Return the content word most similar to *template_word*, or None.

    Candidates share the template word's POS tag and, when possible, its
    dependency relation; otherwise a random dep bucket under the same tag
    is used.  Returns None when the tag is absent entirely.
    """
    by_dep = content_dict.get(template_word.tag)
    if by_dep is None:
        return None

    if template_word.dep in by_dep:
        candidates = by_dep[template_word.dep]
    else:
        # No exact dependency match: fall back to a random dep with this tag.
        candidates = by_dep[random.choice(list(by_dep))]

    if not candidates:
        # Mirrors the original's behavior of returning None when the loop
        # never runs (defensive: build_content_dict never creates empty sets).
        return None

    # Single-pass argmax; the original recomputed similarity() for every
    # improving candidate (two calls per winner).  Ties resolve to the first
    # maximal element encountered, same as the original's strict '>' test.
    return max(candidates, key=template_word.similarity)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
if __name__ == '__main__':
    def _load_or_parse_doc(cache_path, corpus_dir, label):
        """Return the parsed Doc for *corpus_dir*, caching it at *cache_path*."""
        if os.path.exists(cache_path):
            print('Loading parsed %s corpus... ' % label, end='')
            # BUG FIX: the Doc must be built over nlp's vocab.  The original
            # used a fresh spacy.vocab.Vocab(), which is empty -- the
            # deserialized tokens would lack the lexeme data / word vectors
            # that Token.similarity() needs below.
            doc = spacy.tokens.Doc(nlp.vocab)
            doc.from_disk(cache_path)
        else:
            print('Parsing %s corpus... ' % label, end='')
            doc = load_syntax(corpus_dir)
            doc.to_disk(cache_path)
        print('Done')
        return doc

    template_syntax = _load_or_parse_doc(
        'template_syntax.bin', TEMPLATE_CORPUS, 'template')
    content_syntax = _load_or_parse_doc(
        'content_syntax.bin', CONTENT_CORPUS, 'content')

    print('Building content_dict... ', end='')
    content_dict = build_content_dict(content_syntax)
    save_object_to_file('content_dict.bin', content_dict)
    print('Done')

    # Re-generate the first 100 template tokens using the closest content
    # word for each slot.  (A leftover `import ipdb; ipdb.set_trace()` in the
    # no-match branch was removed -- it dropped into a debugger on every miss.)
    for template_word in template_syntax[0:100]:
        closest_word = find_closest_content_word(template_word, content_dict)
        if closest_word:
            print(closest_word.text_with_ws, end='')
        else:
            print('<NOMATCH> ', end='')
|