"""Generate text by swapping syntactically equivalent NLTK parse subtrees.

A table mapping syntax signatures to subtrees is built from the first 100
sentences of Moby Dick and cached in a pickle file.  A random sentence from
Emma is then parsed, and every subtree whose signature appears in the table
is replaced by a randomly chosen subtree with the same signature.

Provenance: added as ``syntax_aware_generate.py`` by commit
92e34b7b769fcb29fdf8c0961e98dcc8f05fc50f (Tyler Hallada, 2016-05-01,
"Playing around with swapping nltk syntax trees").  The same commit adds
``*.p`` and ``*.pickle`` to ``.gitignore`` so the cache file is not
committed.
"""
import os
import pickle
import random
from collections import defaultdict

import nltk
from nltk.tree import Tree
from stat_parser import Parser
from tqdm import tqdm

# Maps a syntax-signature string to the set of subtrees recorded under it.
syntaxes = defaultdict(set)
# Path of the pickle cache holding ``syntaxes`` between runs.
SYNTAXES_FILE = 'syntaxes.p'


def tree_hash(self):
    """Return a hash of a Tree computed from the tuple of its leaves."""
    return hash(tuple(self.leaves()))


# Patched onto Tree so that trees can be added to the sets in ``syntaxes``.
Tree.__hash__ = tree_hash


def _build_syntaxes(parser):
    """Record the signatures of the first 100 Moby Dick sentences."""
    sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
    sents = sents[0:100]
    for sent in tqdm(sents):
        try:
            parsed = parser.parse(' '.join(sent))
        except TypeError:
            # Best effort: skip sentences the parser cannot handle.  Without
            # the ``continue`` the previous sentence's tree would be recorded
            # again (or a NameError raised on the very first sentence).
            continue
        syntax_signature(parsed, save=True)


def generate():
    """Print a random Emma sentence before and after subtree replacement.

    The signature table is loaded from ``SYNTAXES_FILE`` when that file
    exists; otherwise it is built from the corpus and written there first.
    """
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        _build_syntaxes(parser)
        with open(SYNTAXES_FILE, 'wb') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        # NOTE(security): unpickling can execute arbitrary code; this file
        # must only ever be a cache written by this script itself.
        with open(SYNTAXES_FILE, 'rb') as pickle_file:
            syntaxes = pickle.load(pickle_file)
    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed)
    print('='*30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)


def list_to_string(l):
    """Return ``str(l)`` with every space and single quote removed."""
    return str(l).replace(" ", "").replace("'", "")


def syntax_signature(tree, save=False):
    """Return the syntax signature of ``tree`` as a compact string.

    When ``save`` is true, ``tree`` and all of its subtrees are also recorded
    in the module-level ``syntaxes`` table under their own signatures.

    Raises:
        ValueError: if ``tree`` is not an ``nltk.tree.Tree``.
    """
    return list_to_string(syntax_signature_recurse(tree, save=save))


def syntax_signature_recurse(tree, save=False):
    """Return the nested signature of ``tree``.

    A node with no Tree children yields its label (``','`` is spelled
    ``'COMMA'``); any other node yields ``[label, [child signatures...]]``.
    Non-Tree children (the word strings) are ignored.

    When ``save`` is true, each visited node is added to ``syntaxes`` under
    the string form of its signature.

    Raises:
        ValueError: if ``tree`` is not an ``nltk.tree.Tree``.
    """
    if not isinstance(tree, Tree):
        raise ValueError('Not a nltk.tree.Tree: {}'.format(tree))
    label = tree.label()
    if label == ',':
        label = 'COMMA'
    children = [syntax_signature_recurse(child, save=save)
                for child in tree if isinstance(child, Tree)]
    signature = [label, children] if children else label
    if save:
        # Always key by the normalised string: lookups in tree_replace go
        # through list_to_string, so a raw label containing quotes or spaces
        # (e.g. the "''" tag) would otherwise never be found again.
        syntaxes[list_to_string(signature)].add(tree)
    return signature


def tree_replace(tree):
    """Return ``tree`` with subtrees swapped for same-signature alternatives.

    If the signature of ``tree`` itself is in ``syntaxes``, a random recorded
    tree with that signature is returned.  Otherwise the replacement is
    attempted on each Tree child and a new Tree is built from the results; a
    node with no Tree children is returned unchanged.
    """
    sig = syntax_signature(tree)
    # .get() avoids inserting empty entries into the defaultdict, and the
    # truthiness test skips an empty set, on which random.choice would fail.
    candidates = syntaxes.get(sig)
    if candidates:
        return random.choice(tuple(candidates))
    children = [tree_replace(child)
                for child in tree if isinstance(child, Tree)]
    if not children:
        # unable to replace this leaf
        return tree
    return Tree(tree.label(), children)


if __name__ == '__main__':
    generate()