
Allow generating poems from raw text

The filename of the raw text file is supplied as the first command-line argument
to the Python script.
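For example (filenames here are illustrative, not part of the commit):

    python json_to_txt.py tweets.json corpus.txt
    python syntax_aware_generate.py corpus.txt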
Tyler Hallada committed 7 years ago
commit fa8bd171a1

3 changed files with 54 additions and 23 deletions:

  1. generate_poem.py (+29 -18)
  2. json_to_txt.py (+20 -0)
  3. syntax_aware_generate.py (+5 -5)

generate_poem.py (+29 -18)

@@ -1,3 +1,4 @@
+import codecs
 import nltk
 import random
 import re
@@ -32,22 +33,31 @@ class PoemGenerator():
         self.words = []
         self.all_words = []
         self.inflect_engine = inflect.engine()
-        with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
-            reader = csv.reader(statuses, delimiter=',')
-            for row in reader:
-                if 'via buzzfeed ' not in row[1].lower():  # only English
-                    # split title into a list of words and punctuation
-                    title = self.spaces_and_punctuation.findall(row[2])
-                    # spell out digits into ordinal words for syllable counting
-                    title = [string.capwords(
-                             self.inflect_engine.number_to_words(int(word)))
-                             if word.isdigit() else word for word in title]
-                    self.sents.append(title)
-                    self.words.extend(title)
-                    # all_words only contains words, no punctuation
-                    self.all_words.extend([word for word in title
-                                           if not
-                                           self.only_punctuation.match(word)])
+        #  with open('/var/www/buzzfeed-haiku-generator/buzzfeed_facebook_statuses.csv', newline='', encoding='utf-8') as statuses:
+            #  reader = csv.reader(statuses, delimiter=',')
+            #  for row in reader:
+                #  if 'via buzzfeed ' not in row[1].lower():  # only English
+                    #  # split title into a list of words and punctuation
+                    #  title = self.spaces_and_punctuation.findall(row[2])
+                    #  # spell out digits into ordinal words for syllable counting
+                    #  title = [string.capwords(
+                             #  self.inflect_engine.number_to_words(int(word)))
+                             #  if word.isdigit() else word for word in title]
+                    #  self.sents.append(title)
+                    #  self.words.extend(title)
+                    #  # all_words only contains words, no punctuation
+                    #  self.all_words.extend([word for word in title
+                                           #  if not
+                                           #  self.only_punctuation.match(word)])
+        with codecs.open('trump.txt', 'r', 'utf-8') as corpus:
+            text = corpus.read()
+            sents = nltk.tokenize.sent_tokenize(text)
+            words = nltk.tokenize.word_tokenize(text)
+            self.sents.extend(sents)
+            self.words.extend(words)
+            self.all_words.extend([word for word in words
+                                   if not
+                                   self.only_punctuation.match(word)])
         self.bigrams = list(nltk.bigrams(self.words))
         self.cfd = nltk.ConditionalFreqDist(self.bigrams)
         #self.parser = Parser()
@@ -129,7 +139,8 @@ class PoemGenerator():


 if __name__ == '__main__':
-    generator = PoemGenerator(nltk.corpus.gutenberg)
+    generator = PoemGenerator('poop')
     #generator.generate_poem()
-    generator.generate_haiku()
+    haiku = generator.generate_haiku()
+    print haiku
     #generator.generate_endless_poem(None)
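The replacement loading path leans on NLTK's sentence and word tokenizers; a
minimal sketch of that step, assuming the punkt tokenizer data has already been
downloaded (e.g. via nltk.download('punkt')):

    import nltk

    text = u'I write poems. I write them well.'
    sents = nltk.tokenize.sent_tokenize(text)  # ['I write poems.', 'I write them well.']
    words = nltk.tokenize.word_tokenize(text)  # ['I', 'write', 'poems', '.', 'I', ...]

Note that sent_tokenize returns sentence strings, whereas the old CSV path
appended each title as a list of tokens; either way, the bigram
ConditionalFreqDist below is built from the flat word list.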

json_to_txt.py (+20 -0)

@@ -0,0 +1,20 @@
+# Converts a json twitter dump to raw text file.
+import codecs
+import json
+import sys
+
+
+def get_text_from_json(filename):
+    with codecs.open(filename, 'r', 'utf-8') as f:
+        return [item['text'] for item in json.loads(f.read())]
+
+
+def write_text_to_file(filename, text_array, delimiter=' '):
+    text_to_write = delimiter.join(text_array)
+    with codecs.open(filename, 'w', 'utf-8') as f:
+        f.write(text_to_write)
+
+
+if __name__ == '__main__':
+    text_array = get_text_from_json(sys.argv[1])
+    write_text_to_file(sys.argv[2], text_array)
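The new script assumes the JSON file is an array of objects that each carry a
'text' key, the shape of a tweet dump. A minimal round trip, with hypothetical
filenames and data:

    import codecs
    import json

    # Write a tiny stand-in for a twitter dump (hypothetical data).
    with codecs.open('tweets.json', 'w', 'utf-8') as f:
        json.dump([{'text': 'first tweet'}, {'text': 'second tweet'}], f)

    # Equivalent to: python json_to_txt.py tweets.json corpus.txt
    from json_to_txt import get_text_from_json, write_text_to_file
    write_text_to_file('corpus.txt', get_text_from_json('tweets.json'))
    # corpus.txt now contains: first tweet second tweet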

syntax_aware_generate.py (+5 -5)

@@ -5,6 +5,7 @@ import pickle
 import random
 import re
 import codecs
+import sys
 from nltk.tree import Tree
 from collections import defaultdict
 from tqdm import tqdm
@@ -28,13 +29,13 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the docs.


-def generate():
+def generate(filename):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
         #  sents = nltk.corpus.gutenberg.sents('results.txt')
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
             sents = [sent for sent in sents if len(sent) < 150][0:1500]
             for sent in tqdm(sents):
@@ -50,8 +51,7 @@ def generate():
             syntaxes = pickle.load(pickle_file)

     if not os.path.exists(CFDS_FILE):
-        #  corpus = nltk.corpus.gutenberg.raw('results.txt')
-        with codecs.open('results.txt', encoding='utf-8') as corpus:
+        with codecs.open(filename, encoding='utf-8') as corpus:
             cfds = [make_cfd(corpus.read(), i, exclude_punctuation=False, case_insensitive=True) for i in range(2, 5)]
             with open(CFDS_FILE, 'wb+') as pickle_file:
                 pickle.dump(cfds, pickle_file)
@@ -165,4 +165,4 @@ def get_most_common(search, cfds, most_common=None):


 if __name__ == '__main__':
-    generate()
+    generate(sys.argv[1])
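With this change the corpus path comes from the command line, for example
(filename illustrative):

    python syntax_aware_generate.py corpus.txt

Note that the SYNTAXES_FILE and CFDS_FILE pickles are only rebuilt when absent,
so they should be deleted before pointing the script at a different corpus.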