Browse Source

Playing around with swapping nltk syntax trees

Tyler Hallada 8 years ago
parent
commit
92e34b7b76
2 changed files with 91 additions and 0 deletions
  1. 2 0
      .gitignore
  2. 89 0
      syntax_aware_generate.py

+ 2 - 0
.gitignore

@@ -3,3 +3,5 @@
3 3
 *.pyc
4 4
 lib
5 5
 pyStatParser
6
+*.p
7
+*.pickle

+ 89 - 0
syntax_aware_generate.py

@@ -0,0 +1,89 @@
1
+import os
2
+import pickle
3
+import random
4
+import nltk
5
+from nltk.tree import Tree
6
+from collections import defaultdict
7
+from tqdm import tqdm
8
+from stat_parser import Parser
9
+
10
+syntaxes = defaultdict(set)  # flattened syntax signature (str) -> set of Tree nodes recorded under it
11
+SYNTAXES_FILE = 'syntaxes.p'  # pickle file that generate() uses to cache `syntaxes` between runs
12
+
13
+
14
def tree_hash(self):
    """Return a hash derived from the ordered leaves of this tree.

    Intended to be installed as a ``__hash__`` method: the object only
    needs a ``leaves()`` method returning an iterable of hashable items.
    """
    leaf_key = tuple(self.leaves())
    return hash(leaf_key)
16
+
17
+Tree.__hash__ = tree_hash  # monkeypatch: hash Trees by their leaves so they can be added to the sets in `syntaxes`
18
+
19
+
20
def generate():
    """Print a random Emma sentence before and after syntax-based replacement.

    On the first run (no ``SYNTAXES_FILE`` on disk) the first 100 sentences
    of Moby Dick are parsed, every node of every parse is recorded in the
    module-level ``syntaxes`` mapping, and the mapping is pickled to
    ``SYNTAXES_FILE``. On later runs the mapping is loaded from that file
    instead.

    A random sentence from ``austen-emma.txt`` is then parsed and printed,
    passed through ``tree_replace``, and the resulting tree and its leaves
    are printed below a separator line.
    """
    global syntaxes  # rebound below when the cache file is loaded
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
        sents = sents[0:100]
        for sent in tqdm(sents):
            try:
                parsed = parser.parse(' '.join(sent))
            except TypeError:
                # The parser failed on this sentence: skip it. Falling
                # through would either hit an unbound `parsed` (first
                # sentence) or record the previous sentence's tree again.
                continue
            syntax_signature(parsed, save=True)
        # Plain write mode is enough; the file is never read back here.
        with open(SYNTAXES_FILE, 'wb') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        # NOTE: pickle.load can execute arbitrary code; only use a cache
        # file that this script produced itself.
        # Read-only mode so a read-only cache file still loads.
        with open(SYNTAXES_FILE, 'rb') as pickle_file:
            syntaxes = pickle.load(pickle_file)
    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed)
    print('='*30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
46
+
47
+
48
def list_to_string(l):
    """Return ``str(l)`` with every space and single quote removed.

    For a nested list of plain label strings this gives a compact form
    such as ``[NP,[DT,NN]]``.
    """
    text = str(l)
    for unwanted in (" ", "'"):
        text = text.replace(unwanted, "")
    return text
50
+
51
+
52
def syntax_signature(tree, save=False):
    """Return the flattened string signature of *tree*'s syntax.

    The nested structure comes from ``syntax_signature_recurse`` and is
    flattened with ``list_to_string``. *save* is forwarded unchanged to
    the recursive helper.
    """
    nested = syntax_signature_recurse(tree, save=save)
    return list_to_string(nested)
54
+
55
+
56
def syntax_signature_recurse(tree, save=False):
    """Build the nested label structure that describes *tree*'s syntax.

    A node with no ``Tree`` children is represented by its label alone;
    any other node by ``[label, [child signatures, ...]]``. Non-Tree
    children (the words) are ignored, so the result is purely structural.
    The ``','`` label is spelled ``'COMMA'`` (presumably so it cannot be
    confused with the commas in the flattened signature).

    When *save* is true, every visited node is added to the module-level
    ``syntaxes`` mapping under its flattened signature, i.e. the same
    string ``syntax_signature`` returns for that node.

    Raises:
        ValueError: if *tree* is not an ``nltk.tree.Tree``.
    """
    # isinstance (not an exact type check) so Tree subclasses are accepted.
    if not isinstance(tree, Tree):
        raise ValueError('Not a nltk.tree.Tree: {}'.format(tree))

    label = tree.label()
    if label == ',':
        label = 'COMMA'
    children = [
        syntax_signature_recurse(child, save=save)
        for child in tree
        if isinstance(child, Tree)
    ]
    if not children:
        if save:
            # Key by the flattened label, not the raw one: lookups go
            # through list_to_string, which strips quotes and spaces, so a
            # raw label such as "''" would otherwise never be found.
            syntaxes[list_to_string(label)].add(tree)
        return label

    signature = [label, children]
    if save:
        syntaxes[list_to_string(signature)].add(tree)
    return signature
73
+
74
+
75
def tree_replace(tree):
    """Return *tree* with subtrees swapped for recorded trees of equal syntax.

    If the signature of *tree* is a key of the module-level ``syntaxes``
    mapping, a random tree recorded under that signature is returned (the
    stored object itself, not a copy). Otherwise the replacement is tried
    on each ``Tree`` child and a new ``Tree`` with the same label is built
    from the results. A node without any ``Tree`` children that has no
    recorded match is returned unchanged.
    """
    sig = syntax_signature(tree)
    if sig in syntaxes:
        return random.choice(tuple(syntaxes[sig]))

    if not any(isinstance(child, Tree) for child in tree):
        # unable to replace this leaf
        return tree

    # Keep non-Tree children (words) in place; rebuilding from the Tree
    # children alone would silently drop tokens from nodes that mix words
    # and subtrees.
    children = [
        tree_replace(child) if isinstance(child, Tree) else child
        for child in tree
    ]
    return Tree(tree.label(), children)
86
+
87
+
88
+if __name__ == '__main__':  # run the demo only when executed as a script, not on import
89
+    generate()