diff --git a/generate_random_from_file.py b/generate_random_from_file.py
new file mode 100644
index 0000000..6b4aad4
--- /dev/null
+++ b/generate_random_from_file.py
@@ -0,0 +1,36 @@
+import nltk
+import random
+import string
+import sys
+
+
+def main(text):
+    """Print ~155 words of random text driven by bigram frequencies of *text*."""
+    # Keep only tokens that contain at least one non-punctuation character.
+    bigrams = list(nltk.bigrams(
+        [token for token in nltk.word_tokenize(text)
+         if set(token).difference(set(string.punctuation))]))
+    cfdist = nltk.ConditionalFreqDist(bigrams)
+    word = random.choice(bigrams)[0]
+    for i in range(155):
+        print(word, end=' ')
+        if i % 3:
+            # Two of every three steps: pick among all observed successors.
+            top_words = tuple(cfdist[word])
+        else:
+            # Every third step: restrict to the (up to) 3 most frequent
+            # successors to keep the output locally coherent.
+            dist = cfdist[word].copy()
+            top_words = []
+            for _ in range(3):
+                if not dist:
+                    break
+                top_words.append(dist.max())
+                del dist[top_words[-1]]
+        word = random.choice(top_words)
+
+
+if __name__ == '__main__':
+    path = sys.argv[1]
+    with open(path, 'r', encoding='utf8') as f:
+        main(f.read())
diff --git a/notes.md b/notes.md
new file mode 100644
index 0000000..8319fc0
--- /dev/null
+++ b/notes.md
@@ -0,0 +1,13 @@
+What needs to be improved about this repo:
+
+Generalize and standardize the steps in an NLP pipeline into python classes and
+functions. I can think of these off the top of my head:
+
+* Scraper - get text from the internet to local file
+* Cleaner - clean raw text of non-corpus text
+* Ngramer - assemble text in python list of lists
+* Cfdister - restructure data into a conditional frequency distribution
+* Other? - restructure data by other metric (rhyming, similarity, etc.)
+* Assembler loop - takes structure above and outputs one word
+  - Maybe should wrap in a sentence loop, line-by-line loop, paragraph loop,
+    etc.
diff --git a/spacy_gen.py b/spacy_gen.py
new file mode 100644
index 0000000..fbd2eaa
--- /dev/null
+++ b/spacy_gen.py
@@ -0,0 +1,7 @@
+import spacy
+
+nlp = spacy.load('en')
+doc = nlp(u'They told us to duck.')
+
+for token in doc:
+    print(token.pos_, token.tag_)