Re-doing syntax swapping with spaCy 2.0

Tyler Hallada 6 years ago
commit e3ab8d60db

1 changed file with 107 additions and 0 deletions:
    syntax_gen.py

syntax_gen.py  +107 -0

@@ -0,0 +1,107 @@
+import codecs
+import os
+import pickle
+import random
+
+import spacy
+
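+# The template corpus supplies the syntactic structure; the content corpus
+# supplies the words that get swapped in.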
+TEMPLATE_CORPUS = 'austencorpus'
+CONTENT_CORPUS = 'lovecraftcorpus'
+
+print('Loading spaCy model... ', end='')
+nlp = spacy.load('en_core_web_lg')
+print('Done')
+
+
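+# Walk dirname recursively and yield the contents of each text file.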
+def load_text_files(dirname):
+    for (dirpath, dirnames, filenames) in os.walk(dirname):
+        for filename in filenames:
+            with codecs.open(os.path.join(dirpath, filename),
+                             encoding='utf-8') as f:
+                yield f.read()
+
+
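+# Concatenate every file in dirname and parse the combined text with spaCy.
+# Note: the combined corpus needs to fit within nlp.max_length.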
+def load_syntax(dirname):
+    full_text = ''
+    for text in load_text_files(dirname):
+        full_text += text
+    return nlp(full_text)
+
+
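+# Pickle helpers for caching derived objects (like content_dict) between runs.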
+def load_object_from_file(filename):
+    with open(filename, 'rb') as f:
+        return pickle.load(f)
+
+
+def save_object_to_file(filename, obj):
+    with open(filename, 'wb') as f:
+        pickle.dump(obj, f)
+
+
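+# Index every token in the content corpus by (fine-grained POS tag, dependency
+# label) so template tokens can be matched to content tokens in the same role.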
+def build_content_dict(content_syntax):
+    content_dict = {}
+    for word in content_syntax:
+        if word.tag not in content_dict:
+            content_dict[word.tag] = {}
+        if word.dep not in content_dict[word.tag]:
+            content_dict[word.tag][word.dep] = set()
+        content_dict[word.tag][word.dep].add(word)
+    return content_dict
+
+
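+# Return the content token with the same tag (and, if possible, the same
+# dependency label) whose word vector is most similar to template_word.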
+def find_closest_content_word(template_word, content_dict):
+    closest = None
+    closest_score = 0.0
+
+    if template_word.tag in content_dict:
+        if template_word.dep in content_dict[template_word.tag]:
+            content_word_set = content_dict[template_word.tag][template_word.dep]
+        else:
+            # No exact dependency match: fall back to a random dependency
+            # bucket under the same tag.
+            random_dep = random.choice(list(content_dict[template_word.tag].keys()))
+            content_word_set = content_dict[template_word.tag][random_dep]
+    else:
+        return None
+
+    for content_word in content_word_set:
+        score = template_word.similarity(content_word)
+        if closest is None or score > closest_score:
+            closest = content_word
+            closest_score = score
+
+    return closest
+
+
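+# Parse (or load cached parses of) both corpora, then walk the template
+# syntax and swap each token for its closest content-corpus match.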
+if __name__ == '__main__':
+    if os.path.exists('template_syntax.bin'):
+        print('Loading parsed template corpus... ', end='')
+        # Load the cached parse into the model's vocab so its word vectors
+        # are available for similarity comparisons.
+        template_syntax = spacy.tokens.Doc(nlp.vocab)
+        template_syntax.from_disk('template_syntax.bin')
+        print('Done')
+    else:
+        print('Parsing template corpus... ', end='')
+        template_syntax = load_syntax(TEMPLATE_CORPUS)
+        template_syntax.to_disk('template_syntax.bin')
+        print('Done')
+
+    if os.path.exists('content_syntax.bin'):
+        print('Loading parsed content corpus... ', end='')
+        content_syntax = spacy.tokens.Doc(nlp.vocab)
+        content_syntax.from_disk('content_syntax.bin')
+        print('Done')
+    else:
+        print('Parsing content corpus... ', end='')
+        content_syntax = load_syntax(CONTENT_CORPUS)
+        content_syntax.to_disk('content_syntax.bin')
+        print('Done')
+
+    print('Building content_dict... ', end='')
+    content_dict = build_content_dict(content_syntax)
+    save_object_to_file('content_dict.bin', content_dict)
+    print('Done')
+
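+    # Preview the swap: print the first 100 template tokens replaced by their
+    # closest content-corpus matches.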
+    for template_word in template_syntax[0:100]:
+        closest_word = find_closest_content_word(template_word, content_dict)
+        if closest_word:
+            print(closest_word.text_with_ws, end='')
+        else:
+            print('<NOMATCH> ', end='')
+    import ipdb; ipdb.set_trace()