
Pushing work from mule

Tyler Hallada, 7 years ago
parent commit 19aefd163f
3 changed files with 51 additions and 0 deletions
  1. generate_random_from_file.py (+31, -0)
  2. notes.md (+13, -0)
  3. spacy_gen.py (+7, -0)

generate_random_from_file.py (+31, -0)

@@ -0,0 +1,31 @@
+import nltk
+import random
+import string
+import sys
+
+
+def main(text):
+    bigrams = list(nltk.bigrams(
+        [token for token in nltk.word_tokenize(text.decode('utf8'))
+         if set(token).difference(set(string.punctuation))]))
+    cfdist = nltk.ConditionalFreqDist(bigrams)
+    word = random.choice(bigrams)[0]
+    for i in range(155):
+        print word,
+        if i % 3:
+            top_words = tuple(cfdist[word])
+        else:
+            dist = cfdist[word].copy()
+            top_words = []
+            for i in range(3):
+                if dist:
+                    top_words.append(dist.max())
+                    del dist[top_words[-1]]
+                else:
+                    break
+        word = random.choice(top_words)
+
+if __name__ == '__main__':
+    file = sys.argv[1]
+    with open(file, 'r') as f:
+        main(f.read())
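In plain terms, the script builds a conditional frequency distribution over word bigrams from the input file and then prints 155 words: on every third step (including the first) it chooses among the three most frequent continuations of the previous word, and on the other steps it chooses uniformly among all distinct continuations seen in the corpus (if it wanders into a word with no observed continuation, random.choice will fail on the empty sequence). It is Python 2 code (the print statement, str.decode('utf8')), expects NLTK's 'punkt' tokenizer data to be installed, and is invoked with a path to a text file as its only argument.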

notes.md (+13, -0)

@@ -0,0 +1,13 @@
+What needs to be improved about this repo:
+
+Generalize and standardize the steps in an NLP pipeline into python classes and
+functions. I can think of these off the top of my head:
+
+* Scraper - get text from the internet to local file
+* Cleaner - clean raw text of non-corpus text
+* Ngramer - assemble text in python list of lists
+* Cfdister - restructure data into a conditional frequency distribution
+* Other? - restructure data by other metric (rhyming, similarity, etc.)
+* Assembler loop - takes structure above and outputs one word
+    - Maybe should wrap in a sentence loop, line-by-line loop, paragraph loop,
+      etc.
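A rough sketch of how the stages listed above could be organized as classes, assuming an NLTK-backed bigram pipeline like the one in generate_random_from_file.py. All class and method names here are hypothetical, and the Scraper and Other stages are left out:

    import random

    import nltk


    class Cleaner:
        """Cleaner: strip non-corpus text (here just whitespace normalization)."""
        def clean(self, raw):
            return ' '.join(raw.split())


    class Ngramer:
        """Ngramer: tokenize text and assemble it into a list of n-gram tuples."""
        def __init__(self, n=2):
            self.n = n

        def ngrams(self, text):
            return list(nltk.ngrams(nltk.word_tokenize(text), self.n))


    class Cfdister:
        """Cfdister: restructure bigrams into a conditional frequency distribution."""
        def cfdist(self, bigrams):
            return nltk.ConditionalFreqDist(bigrams)


    class Assembler:
        """Assembler loop: walk the distribution, emitting one word at a time."""
        def generate(self, cfdist, seed, length=50):
            word, words = seed, []
            for _ in range(length):
                words.append(word)
                if not cfdist[word]:  # dead end: no observed continuation
                    break
                word = random.choice(list(cfdist[word]))
            return ' '.join(words)


    if __name__ == '__main__':
        # 'corpus.txt' is a placeholder path, not a file in this repo.
        text = Cleaner().clean(open('corpus.txt').read())
        bigrams = Ngramer(2).ngrams(text)
        cfd = Cfdister().cfdist(bigrams)
        print(Assembler().generate(cfd, seed=bigrams[0][0]))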

spacy_gen.py (+7, -0)

@@ -0,0 +1,7 @@
+import spacy
+
+nlp = spacy.load('en')
+doc = nlp(u'They told us to duck.')
+
+for token in doc:
+    print (token.pos_, token.tag_)
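With an English model loaded, this example should tag 'duck' in 'They told us to duck.' as a verb rather than a noun, which seems to be the point of the sentence. Note that the 'en' shortcut used by spacy.load('en') belongs to older spaCy releases; newer versions expect an explicit model name such as 'en_core_web_sm'.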