Pushing work from mule
This commit is contained in:
parent 3ace25b6e2
commit 19aefd163f
generate_random_from_file.py (31 lines, Normal file)
@@ -0,0 +1,31 @@
import nltk
import random
import string
import sys


def main(text):
    bigrams = list(nltk.bigrams(
        [token for token in nltk.word_tokenize(text.decode('utf8'))
         if set(token).difference(set(string.punctuation))]))
    cfdist = nltk.ConditionalFreqDist(bigrams)
    word = random.choice(bigrams)[0]
    for i in range(155):
        print word,
        if i % 3:
            top_words = tuple(cfdist[word])
        else:
            dist = cfdist[word].copy()
            top_words = []
            for i in range(3):
                if dist:
                    top_words.append(dist.max())
                    del dist[top_words[-1]]
                else:
                    break
        word = random.choice(top_words)


if __name__ == '__main__':
    file = sys.argv[1]
    with open(file, 'r') as f:
        main(f.read())
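
The script's core data structure is an nltk ConditionalFreqDist built over token bigrams; the loop then walks the text by repeatedly picking the next word from the current word's most frequent continuations. A minimal illustration of that idea, assuming nltk and its punkt tokenizer data are installed (the toy corpus string is made up):

import nltk

# Toy corpus, invented purely for illustration.
text = "the cat sat on the mat and the cat slept"
bigrams = list(nltk.bigrams(nltk.word_tokenize(text)))
cfdist = nltk.ConditionalFreqDist(bigrams)

# Most frequent words seen immediately after "the".
print(cfdist['the'].most_common(3))   # [('cat', 2), ('mat', 1)]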
notes.md (13 lines, Normal file)
@@ -0,0 +1,13 @@
What needs to be improved about this repo:

Generalize and standardize the steps in an NLP pipeline into python classes and
functions. I can think of these off the top of my head:

* Scraper - get text from the internet to local file
* Cleaner - clean raw text of non-corpus text
* Ngramer - assemble text in python list of lists
* Cfdister - restructure data into a conditional frequency distribution
* Other? - restructure data by other metric (rhyming, similarity, etc.)
* Assembler loop - takes structure above and outputs one word
  - Maybe should wrap in a sentence loop, line-by-line loop, paragraph loop,
    etc.
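
A rough sketch of how a few of the pieces listed in notes.md could look as Python classes; the class names come from that list, but the method names and signatures are invented for illustration and assume nltk is available:

import random
import string

import nltk


class Cleaner:
    """Clean raw tokens of non-corpus text (here: punctuation-only tokens)."""
    def clean(self, tokens):
        return [t for t in tokens
                if set(t).difference(set(string.punctuation))]


class Ngramer:
    """Assemble tokens into a list of n-gram tuples."""
    def __init__(self, n=2):
        self.n = n

    def ngrams(self, tokens):
        return list(nltk.ngrams(tokens, self.n))


class Cfdister:
    """Restructure (word, next_word) pairs into a conditional frequency distribution."""
    def build(self, bigrams):
        return nltk.ConditionalFreqDist(bigrams)


class Assembler:
    """Take the structure above and output one word at a time."""
    def __init__(self, cfdist):
        self.cfdist = cfdist

    def next_word(self, word):
        candidates = [w for w, _ in self.cfdist[word].most_common(3)]
        return random.choice(candidates)

Wiring these together would mirror main() in generate_random_from_file.py: tokenize, clean, build bigrams, build the cfdist, then call next_word() in a loop (later wrapped in a sentence or paragraph loop).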
spacy_gen.py (7 lines, Normal file)
@@ -0,0 +1,7 @@
import spacy

nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')

for token in doc:
    print (token.pos_, token.tag_)
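
spacy.load('en') relies on the old model shortcut links, which were removed in spaCy 3; current releases load a model by its package name instead. Assuming en_core_web_sm has been downloaded (python -m spacy download en_core_web_sm), an equivalent would be:

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('They told us to duck.')

# Coarse and fine-grained part-of-speech tags for each token.
for token in doc:
    print(token.pos_, token.tag_)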