90 lines
2.4 KiB
Python
90 lines
2.4 KiB
Python
|
import os
|
||
|
import pickle
|
||
|
import random
|
||
|
import nltk
|
||
|
from nltk.tree import Tree
|
||
|
from collections import defaultdict
|
||
|
from tqdm import tqdm
|
||
|
from stat_parser import Parser
|
||
|
|
||
|
# Module-level table: maps a syntax signature (a bare label, or the string
# produced by list_to_string for a nested signature) to the set of trees
# recorded under it. Filled by syntax_signature_recurse when save=True and
# read by tree_replace.
syntaxes = defaultdict(set)
|
||
|
# Path of the pickle file in which generate() caches `syntaxes` between runs.
SYNTAXES_FILE = 'syntaxes.p'
|
||
|
|
||
|
|
||
|
def tree_hash(self):
    """Return a hash computed from the tree's leaves, in order.

    Only the leaves take part in the hash, so two trees with the same
    leaf sequence hash identically regardless of their labels or shape.
    """
    leaf_sequence = tuple(self.leaves())
    return hash(leaf_sequence)
|
||
|
|
||
|
# Install the leaf-based hash on every Tree so that trees can be added to
# the sets stored in `syntaxes`.
# NOTE(review): presumably needed because nltk's Tree is not hashable by
# default - confirm against the nltk version in use.
Tree.__hash__ = tree_hash
|
||
|
|
||
|
|
||
|
def generate():
    """Build or load the syntax table, then print a sentence and its rewrite.

    If ``SYNTAXES_FILE`` does not exist, the first 100 sentences of the
    Gutenberg ``melville-moby_dick.txt`` corpus are parsed, each parse is
    recorded in the module-level ``syntaxes`` table (via
    ``syntax_signature(..., save=True)``), and the table is pickled to
    ``SYNTAXES_FILE``. Otherwise the table is loaded from that file.

    A random sentence from ``austen-emma.txt`` is then parsed, passed through
    ``tree_replace``, and both the original and the replaced tree are printed
    together with their leaves joined into sentences.
    """
    global syntaxes
    parser = Parser()
    if not os.path.exists(SYNTAXES_FILE):
        sents = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
        sents = sents[0:100]
        for sent in tqdm(sents):
            try:
                parsed = parser.parse(' '.join(sent))
            except TypeError:
                # The parser could not handle this sentence: skip it.
                # Falling through would either hit an unbound `parsed`
                # (first sentence) or re-record the previous sentence's
                # tree a second time.
                continue
            syntax_signature(parsed, save=True)
        # The cache is only written here, so plain write mode is enough.
        with open(SYNTAXES_FILE, 'wb') as pickle_file:
            pickle.dump(syntaxes, pickle_file)
    else:
        # NOTE: pickle.load can execute arbitrary code; SYNTAXES_FILE must
        # only ever be a file produced by this script.
        # Read-only mode lets a cache file without write permission load.
        with open(SYNTAXES_FILE, 'rb') as pickle_file:
            syntaxes = pickle.load(pickle_file)

    sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)
    print(' '.join(parsed.leaves()))
    replaced_tree = tree_replace(parsed)
    print('='*30)
    print(' '.join(replaced_tree.leaves()))
    print(replaced_tree)
|
||
|
|
||
|
|
||
|
def list_to_string(l):
    """Return ``str(l)`` with every space and single quote removed.

    For a nested list of plain strings this yields a compact form such as
    ``[S,[NP,VP]]``.
    """
    # Mapping a code point to None makes str.translate delete it.
    dropped = {ord(" "): None, ord("'"): None}
    return str(l).translate(dropped)
|
||
|
|
||
|
|
||
|
def syntax_signature(tree, save=False):
    """Return the syntax signature of ``tree`` as a compact string.

    The nested signature built by ``syntax_signature_recurse`` is flattened
    to text with ``list_to_string``. ``save`` is forwarded unchanged, so
    when it is true the tree and its subtrees are recorded in ``syntaxes``.
    """
    nested_signature = syntax_signature_recurse(tree, save=save)
    return list_to_string(nested_signature)
|
||
|
|
||
|
|
||
|
def syntax_signature_recurse(tree, save=False):
    """Build the nested syntax signature of ``tree``.

    A node with no ``Tree`` children yields its label; any other node
    yields ``[label, [child signatures...]]``. Children that are not
    exactly of type ``Tree`` (e.g. leaf strings) are ignored.

    When ``save`` is true, every visited node is added to the module-level
    ``syntaxes`` table: under its bare label when it has no ``Tree``
    children, otherwise under the ``list_to_string`` form of its signature.

    Raises:
        ValueError: if ``tree`` is not exactly an ``nltk.tree.Tree``.
    """
    if type(tree) is not Tree:
        raise ValueError('Not a nltk.tree.Tree: {}'.format(tree))

    label = tree.label()
    if label == ',':
        # Presumably renamed so the label cannot be mistaken for the commas
        # that separate list elements in the stringified signature.
        label = 'COMMA'

    # Children are processed (and saved) before the node itself.
    children = []
    for child in tree:
        if type(child) is Tree:
            children.append(syntax_signature_recurse(child, save=save))

    signature = [label, children] if children else label
    if save:
        key = list_to_string(signature) if children else label
        syntaxes[key].add(tree)
    return signature
|
||
|
|
||
|
|
||
|
def tree_replace(tree):
    """Return a tree in which known structures are swapped for stored ones.

    If the signature of ``tree`` is a key of ``syntaxes``, a randomly chosen
    tree recorded under that signature is returned. Otherwise the ``Tree``
    children are replaced recursively and a new ``Tree`` with the same label
    is built from them; a node without ``Tree`` children is returned as is.
    """
    signature = syntax_signature(tree)
    if signature in syntaxes:
        candidates = tuple(syntaxes[signature])
        return random.choice(candidates)

    replaced_children = []
    for child in tree:
        if type(child) is Tree:
            replaced_children.append(tree_replace(child))

    if replaced_children:
        return Tree(tree.label(), replaced_children)

    # unable to replace this leaf
    return tree
|
||
|
|
||
|
|
||
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    generate()
|