|
@@ -0,0 +1,89 @@
|
|
1
|
+import os
|
|
2
|
+import pickle
|
|
3
|
+import random
|
|
4
|
+import nltk
|
|
5
|
+from nltk.tree import Tree
|
|
6
|
+from collections import defaultdict
|
|
7
|
+from tqdm import tqdm
|
|
8
|
+from stat_parser import Parser
|
|
9
|
+
|
|
10
|
# Maps a signature key (a bare label, or the compact string produced by
# list_to_string for a [label, children] structure) to the set of subtrees
# that were saved under that key.
syntaxes = defaultdict(set)
# Path of the pickle file in which generate() caches `syntaxes`.
SYNTAXES_FILE = 'syntaxes.p'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
def tree_hash(self):
    """Return a hash computed from the sequence of this tree's leaves.

    Only the leaves take part in the hash; the tree's labels and shape are
    not considered here.
    """
    leaves = tuple(self.leaves())
    return hash(leaves)
|
|
16
|
+
|
|
17
|
# Install tree_hash as Tree.__hash__ so that Tree instances can be added to
# the sets stored in `syntaxes`.
# NOTE(review): this patches the nltk class for the whole process - confirm
# nothing else relies on Tree's default hashing behaviour.
Tree.__hash__ = tree_hash
|
|
18
|
+
|
|
19
|
+
|
|
20
|
def generate():
    """Print a random sentence from Emma and a variant built from saved subtrees.

    On the first run (when SYNTAXES_FILE does not exist) the first 100
    sentences of Moby Dick are parsed, every subtree is recorded in the
    module-level ``syntaxes`` mapping via ``syntax_signature(..., save=True)``,
    and the mapping is pickled to SYNTAXES_FILE. On later runs the mapping is
    loaded from that file instead.

    A random sentence from Emma is then parsed, and the parse tree, its
    leaves, and the tree returned by ``tree_replace`` are printed.
    """
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
        sents = sents[0:100]
        for sent in tqdm(sents):
            try:
                parsed = parser.parse(' '.join(sent))
            except TypeError:
                # The parse failed for this sentence: skip it. Falling through
                # would either hit an unbound `parsed` (first iteration) or
                # re-save the previous sentence's tree.
                continue
            syntax_signature(parsed, save=True)
        # Write-only access is all that is needed to dump the cache.
        with open(SYNTAXES_FILE, 'wb') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        # NOTE: pickle.load can execute arbitrary code; SYNTAXES_FILE must only
        # ever be a file written by this script, never untrusted input.
        # Read-only mode so a read-only cache file still loads.
        with open(SYNTAXES_FILE, 'rb') as pickle_file:
            syntaxes = pickle.load(pickle_file)
    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed)
    print('='*30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
def list_to_string(l):
    """Return ``str(l)`` with every space and single-quote character removed.

    For a nested list of strings this gives a compact form such as
    ``[S,[NP,VP]]``.
    """
    strip_table = str.maketrans("", "", " '")
    return str(l).translate(strip_table)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
def syntax_signature(tree, save=False):
    """Return the compact string signature of ``tree``.

    The nested structure comes from ``syntax_signature_recurse`` and is
    flattened to a string with ``list_to_string``. ``save`` is forwarded
    unchanged to the recursive helper.
    """
    structure = syntax_signature_recurse(tree, save=save)
    return list_to_string(structure)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
def syntax_signature_recurse(tree, save=False):
    """Build the nested label structure of ``tree``.

    Returns the node's label alone when the node has no Tree children,
    otherwise ``[label, [child structures...]]``. A label of ``','`` is
    reported as ``'COMMA'``. When ``save`` is true, every visited node is
    added to the module-level ``syntaxes`` mapping under its own key: the
    label for nodes without Tree children, or the ``list_to_string`` form of
    the structure otherwise.

    Raises:
        ValueError: if ``tree`` is not exactly an ``nltk.tree.Tree``.
    """
    global syntaxes
    if type(tree) is not Tree:
        raise ValueError('Not a nltk.tree.Tree: {}'.format(tree))

    node_label = tree.label()
    if node_label == ',':
        node_label = 'COMMA'

    # Only Tree children contribute to the structure; plain leaves are ignored.
    child_structures = []
    for child in tree:
        if type(child) is Tree:
            child_structures.append(syntax_signature_recurse(child, save=save))

    if child_structures:
        if save:
            key = list_to_string([node_label, child_structures])
            syntaxes[key].add(tree)
        return [node_label, child_structures]

    if save:
        syntaxes[node_label].add(tree)
    return node_label
|
|
73
|
+
|
|
74
|
+
|
|
75
|
def tree_replace(tree):
    """Return a tree in which subtrees are swapped for saved ones where possible.

    If the signature of ``tree`` is a key of ``syntaxes``, a random tree saved
    under that key is returned. Otherwise each Tree child is processed
    recursively and a new Tree with the same label is built from the results.
    A node with no Tree children and no matching signature is returned as is.
    """
    signature = syntax_signature(tree)
    if signature in syntaxes:
        candidates = tuple(syntaxes[signature])
        return random.choice(candidates)

    replaced_children = []
    for child in tree:
        if type(child) is Tree:
            replaced_children.append(tree_replace(child))

    if replaced_children:
        return Tree(tree.label(), replaced_children)
    # unable to replace this leaf
    return tree
|
|
86
|
+
|
|
87
|
+
|
|
88
|
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    generate()
|