Pushing work from mule
This commit is contained in:
parent 3ace25b6e2
commit 19aefd163f
generate_random_from_file.py (31 lines, Normal file)
@@ -0,0 +1,31 @@
import nltk
import random
import string
import sys


def main(text):
    bigrams = list(nltk.bigrams(
        [token for token in nltk.word_tokenize(text.decode('utf8'))
         if set(token).difference(set(string.punctuation))]))
    cfdist = nltk.ConditionalFreqDist(bigrams)
    word = random.choice(bigrams)[0]
    for i in range(155):
        print word,
        if i % 3:
            top_words = tuple(cfdist[word])
        else:
            dist = cfdist[word].copy()
            top_words = []
            for i in range(3):
                if dist:
                    top_words.append(dist.max())
                    del dist[top_words[-1]]
                else:
                    break
        word = random.choice(top_words)


if __name__ == '__main__':
    file = sys.argv[1]
    with open(file, 'r') as f:
        main(f.read())
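
The script's core data structure is an nltk ConditionalFreqDist built over token bigrams; the loop then walks the text by repeatedly picking the next word from the current word's most frequent continuations. A minimal illustration of that idea, assuming nltk and its punkt tokenizer data are installed (the toy corpus string is made up):

import nltk

# Toy corpus, invented purely for illustration.
text = "the cat sat on the mat and the cat slept"
bigrams = list(nltk.bigrams(nltk.word_tokenize(text)))
cfdist = nltk.ConditionalFreqDist(bigrams)

# Most frequent words seen immediately after "the".
print(cfdist['the'].most_common(3))   # [('cat', 2), ('mat', 1)]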
notes.md (13 lines, Normal file)
@@ -0,0 +1,13 @@
What needs to be improved about this repo:

Generalize and standardize the steps in an NLP pipeline into python classes and
functions. I can think of these off the top of my head:

* Scraper - get text from the internet to local file
* Cleaner - clean raw text of non-corpus text
* Ngramer - assemble text in python list of lists
* Cfdister - restructure data into a conditional frequency distribution
* Other? - restructure data by other metric (rhyming, similarity, etc.)
* Assembler loop - takes structure above and outputs one word
  - Maybe should wrap in a sentence loop, line-by-line loop, paragraph loop,
    etc.
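
A rough sketch of how a few of the pieces listed in notes.md could look as Python classes; the class names come from that list, but the method names and signatures are invented for illustration and assume nltk is available:

import random
import string

import nltk


class Cleaner:
    """Clean raw tokens of non-corpus text (here: punctuation-only tokens)."""
    def clean(self, tokens):
        return [t for t in tokens
                if set(t).difference(set(string.punctuation))]


class Ngramer:
    """Assemble tokens into a list of n-gram tuples."""
    def __init__(self, n=2):
        self.n = n

    def ngrams(self, tokens):
        return list(nltk.ngrams(tokens, self.n))


class Cfdister:
    """Restructure (word, next_word) pairs into a conditional frequency distribution."""
    def build(self, bigrams):
        return nltk.ConditionalFreqDist(bigrams)


class Assembler:
    """Take the structure above and output one word at a time."""
    def __init__(self, cfdist):
        self.cfdist = cfdist

    def next_word(self, word):
        candidates = [w for w, _ in self.cfdist[word].most_common(3)]
        return random.choice(candidates)

Wiring these together would mirror main() in generate_random_from_file.py: tokenize, clean, build bigrams, build the cfdist, then call next_word() in a loop (later wrapped in a sentence or paragraph loop).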
spacy_gen.py (7 lines, Normal file)
@@ -0,0 +1,7 @@
import spacy

nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')

for token in doc:
    print (token.pos_, token.tag_)
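
spacy.load('en') relies on the old model shortcut links, which were removed in spaCy 3; current releases load a model by its package name instead. Assuming en_core_web_sm has been downloaded (python -m spacy download en_core_web_sm), an equivalent would be:

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('They told us to duck.')

# Coarse and fine-grained part-of-speech tags for each token.
for token in doc:
    print(token.pos_, token.tag_)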