Pushing work from mule
This commit is contained in:
parent
3ace25b6e2
commit
19aefd163f
31
generate_random_from_file.py
Normal file
31
generate_random_from_file.py
Normal file
@ -0,0 +1,31 @@
|
||||
import nltk
|
||||
import random
|
||||
import string
|
||||
import sys
|
||||
|
||||
|
||||
def main(text):
    """Print a 155-word pseudo-random passage modelled on *text*.

    The text is tokenized with NLTK, tokens made up purely of punctuation
    are dropped, and a conditional frequency distribution over word bigrams
    is built.  Starting from a random word, each next word is drawn at
    random from the words seen following the current one; on every third
    step the draw is restricted to the three most frequent followers.

    Args:
        text: The corpus, either as UTF-8 encoded bytes or as already
            decoded text.

    Returns:
        None.  The generated words are written to standard output as one
        space-separated line.  Nothing is printed when the text contains
        fewer than two usable tokens.
    """
    # Accept raw bytes (what Python 2's open(..., 'r') yields) as well as
    # decoded text, so the function runs unchanged on Python 2 and 3.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    # Built once rather than once per token.
    punctuation = set(string.punctuation)
    # Keep only tokens that contain at least one non-punctuation character.
    tokens = [token for token in nltk.word_tokenize(text)
              if set(token).difference(punctuation)]
    bigrams = list(nltk.bigrams(tokens))
    if not bigrams:
        # Nothing to chain together; random.choice() would raise on [].
        return

    cfdist = nltk.ConditionalFreqDist(bigrams)
    word = random.choice(bigrams)[0]
    words = []
    for step in range(155):
        words.append(word)
        # Membership test first: indexing cfdist with an unseen condition
        # would insert an empty distribution for it.
        followers = cfdist[word] if word in cfdist else None
        if not followers:
            # Dead end (e.g. a word that only occurs as the final token):
            # restart from a random position instead of crashing.
            word = random.choice(bigrams)[0]
            continue
        if step % 3:
            candidates = list(followers)
        else:
            # Every third step, stick to the three most frequent followers.
            candidates = [follower
                          for follower, _ in followers.most_common(3)]
        word = random.choice(candidates)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(' '.join(words))
|
||||
|
||||
if __name__ == '__main__':
    # Fail with a readable usage message instead of an IndexError traceback
    # when the corpus path is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: generate_random_from_file.py FILE')
    # Read raw bytes; main() is responsible for decoding them as UTF-8.
    with open(sys.argv[1], 'rb') as f:
        main(f.read())
|
13
notes.md
Normal file
13
notes.md
Normal file
@ -0,0 +1,13 @@
|
||||
What needs to be improved about this repo:
|
||||
|
||||
Generalize and standardize the steps in an NLP pipeline into Python classes and
|
||||
functions. I can think of these off the top of my head:
|
||||
|
||||
* Scraper - get text from the internet to local file
|
||||
* Cleaner - clean raw text of non-corpus text
|
||||
* Ngramer - assemble text into a Python list of lists
|
||||
* Cfdister - restructure data into a conditional frequency distribution
|
||||
* Other? - restructure data by other metric (rhyming, similarity, etc.)
|
||||
* Assembler loop - takes structure above and outputs one word
|
||||
- Maybe this should be wrapped in a sentence loop, line-by-line loop, paragraph loop,
|
||||
etc.
|
7
spacy_gen.py
Normal file
7
spacy_gen.py
Normal file
@ -0,0 +1,7 @@
|
||||
import spacy
|
||||
|
||||
# Load the spaCy pipeline registered under the name 'en'.
nlp = spacy.load('en')

# Run the pipeline over a single sample sentence.
doc = nlp(u'They told us to duck.')

# Print the pos_ and tag_ attributes of every token in the parsed sentence.
for token in doc:
    print (token.pos_, token.tag_)
|
Loading…
Reference in New Issue
Block a user