
Pushing work from mule

Tyler Hallada, 7 years ago
parent commit 19aefd163f
3 changed files with 51 additions and 0 deletions
  1. generate_random_from_file.py (+31, -0)
  2. notes.md (+13, -0)
  3. spacy_gen.py (+7, -0)

generate_random_from_file.py (+31, -0)

@@ -0,0 +1,31 @@
+import nltk
+import random
+import string
+import sys
+
+
+def main(text):
+    bigrams = list(nltk.bigrams(
+        [token for token in nltk.word_tokenize(text.decode('utf8'))
+         if set(token).difference(set(string.punctuation))]))
+    cfdist = nltk.ConditionalFreqDist(bigrams)
+    word = random.choice(bigrams)[0]
+    for i in range(155):
+        print word,
+        if i % 3:
+            top_words = tuple(cfdist[word])
+        else:
+            dist = cfdist[word].copy()
+            top_words = []
+            for i in range(3):
+                if dist:
+                    top_words.append(dist.max())
+                    del dist[top_words[-1]]
+                else:
+                    break
+        word = random.choice(top_words)
+
+if __name__ == '__main__':
+    file = sys.argv[1]
+    with open(file, 'r') as f:
+        main(f.read())
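In plain terms, the script builds a conditional frequency distribution over word bigrams from the input file and then prints 155 words: on every third step (including the first) it chooses among the three most frequent continuations of the previous word, and on the other steps it chooses uniformly among all distinct continuations seen in the corpus (if it wanders into a word with no observed continuation, random.choice will fail on the empty sequence). It is Python 2 code (the print statement, str.decode('utf8')), expects NLTK's 'punkt' tokenizer data to be installed, and is invoked with a path to a text file as its only argument.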

notes.md (+13, -0)

@@ -0,0 +1,13 @@
+What needs to be improved about this repo:
+
+Generalize and standardize the steps in an NLP pipeline into python classes and
+functions. I can think of these off the top of my head:
+
+* Scraper - get text from the internet to local file
+* Cleaner - clean raw text of non-corpus text
+* Ngramer - assemble text in python list of lists
+* Cfdister - restructure data into a conditional frequency distribution
+* Other? - restructure data by other metric (rhyming, similarity, etc.)
+* Assembler loop - takes structure above and outputs one word
+    - Maybe should wrap in a sentence loop, line-by-line loop, paragraph loop,
+      etc.
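A rough sketch of how the stages listed above could be organized as classes, assuming an NLTK-backed bigram pipeline like the one in generate_random_from_file.py. All class and method names here are hypothetical, and the Scraper and Other stages are left out:

    import random

    import nltk


    class Cleaner:
        """Cleaner: strip non-corpus text (here just whitespace normalization)."""
        def clean(self, raw):
            return ' '.join(raw.split())


    class Ngramer:
        """Ngramer: tokenize text and assemble it into a list of n-gram tuples."""
        def __init__(self, n=2):
            self.n = n

        def ngrams(self, text):
            return list(nltk.ngrams(nltk.word_tokenize(text), self.n))


    class Cfdister:
        """Cfdister: restructure bigrams into a conditional frequency distribution."""
        def cfdist(self, bigrams):
            return nltk.ConditionalFreqDist(bigrams)


    class Assembler:
        """Assembler loop: walk the distribution, emitting one word at a time."""
        def generate(self, cfdist, seed, length=50):
            word, words = seed, []
            for _ in range(length):
                words.append(word)
                if not cfdist[word]:  # dead end: no observed continuation
                    break
                word = random.choice(list(cfdist[word]))
            return ' '.join(words)


    if __name__ == '__main__':
        # 'corpus.txt' is a placeholder path, not a file in this repo.
        text = Cleaner().clean(open('corpus.txt').read())
        bigrams = Ngramer(2).ngrams(text)
        cfd = Cfdister().cfdist(bigrams)
        print(Assembler().generate(cfd, seed=bigrams[0][0]))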

spacy_gen.py (+7, -0)

@@ -0,0 +1,7 @@
+import spacy
+
+nlp = spacy.load('en')
+doc = nlp(u'They told us to duck.')
+
+for token in doc:
+    print (token.pos_, token.tag_)
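With an English model loaded, this example should tag 'duck' in 'They told us to duck.' as a verb rather than a noun, which seems to be the point of the sentence. Note that the 'en' shortcut used by spacy.load('en') belongs to older spaCy releases; newer versions expect an explicit model name such as 'en_core_web_sm'.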