Various scripts for playing around with natural language processing/generation

syntax_gen.py 3.3KB

(Full source of syntax_gen.py follows.)
  1. import codecs
  2. import os
  3. import pickle
  4. import random
  5. import spacy
  6. TEMPLATE_CORPUS = 'austencorpus'
  7. CONTENT_CORPUS = 'lovecraftcorpus'
  8. print('Loading spaCy model... ', end='')
  9. nlp = spacy.load('en_core_web_lg')
  10. print('Done')
  11. def load_text_files(dirname):
  12. for (dirpath, dirnames, filenames) in os.walk(dirname):
  13. for filename in filenames:
  14. with codecs.open(os.path.join(dirpath, filename),
  15. encoding='utf-8') as f:
  16. yield f.read()
  17. def load_syntax(dirname):
  18. full_text = ''
  19. for text in load_text_files(dirname):
  20. full_text += text
  21. return nlp(full_text)
  22. def load_object_to_file(filename):
  23. with open(filename, 'rb') as f:
  24. return pickle.load(f)
  25. def save_object_to_file(filename, object):
  26. with open(filename, 'wb') as f:
  27. pickle.dump(object, f)
  28. def build_content_dict(content_syntax):
  29. content_dict = {}
  30. for word in content_syntax:
  31. if word.tag not in content_dict:
  32. content_dict[word.tag] = {}
  33. if word.dep not in content_dict[word.tag]:
  34. content_dict[word.tag][word.dep] = set()
  35. content_dict[word.tag][word.dep].add(word)
  36. return content_dict
  37. def find_closest_content_word(template_word, content_dict):
  38. closest = None
  39. closest_score = 0.0
  40. if template_word.tag in content_dict:
  41. if template_word.dep in content_dict[template_word.tag]:
  42. content_word_set = content_dict[template_word.tag][template_word.dep]
  43. else:
  44. random_dep = random.choice(list(content_dict[template_word.tag].keys()))
  45. content_word_set = content_dict[template_word.tag][random_dep]
  46. else:
  47. return None
  48. for content_word in content_word_set:
  49. if closest is None or template_word.similarity(content_word) > closest_score:
  50. closest = content_word
  51. closest_score = template_word.similarity(content_word)
  52. return closest
  53. if __name__ == '__main__':
  54. if os.path.exists('template_syntax.bin'):
  55. print('Loading parsed template corpus... ', end='')
  56. template_syntax = spacy.tokens.Doc(spacy.vocab.Vocab())
  57. template_syntax.from_disk('template_syntax.bin')
  58. print('Done')
  59. else:
  60. print('Parsing template corpus... ', end='')
  61. template_syntax = load_syntax(TEMPLATE_CORPUS)
  62. template_syntax.to_disk('template_syntax.bin')
  63. print('Done')
  64. if os.path.exists('content_syntax.bin'):
  65. print('Loading parsed content corpus... ', end='')
  66. content_syntax = spacy.tokens.Doc(spacy.vocab.Vocab())
  67. content_syntax.from_disk('content_syntax.bin')
  68. print('Done')
  69. else:
  70. print('Parsing content corpus... ', end='')
  71. content_syntax = load_syntax(CONTENT_CORPUS)
  72. content_syntax.to_disk('content_syntax.bin')
  73. print('Done')
  74. print('Building content_dict... ', end='')
  75. content_dict = build_content_dict(content_syntax)
  76. save_object_to_file('content_dict.bin', content_dict)
  77. print('Done')
  78. for template_word in template_syntax[0:100]:
  79. closest_word = find_closest_content_word(template_word, content_dict)
  80. if closest_word:
  81. print(closest_word.text_with_ws, end='')
  82. else:
  83. print('<NOMATCH> ', end='')
  84. import ipdb; ipdb.set_trace()