diff --git a/edX Lightning Talk.ipynb b/edX Lightning Talk.ipynb
new file mode 100644
index 0000000..4ea9879
--- /dev/null
+++ b/edX Lightning Talk.ipynb
@@ -0,0 +1,744 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Generating random poems with Python #\n",
+    "\n",
+    "(I never said they would be good poems)"
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Phone autocomplete ##\n", + "\n", + "You can generate random text that sounds like you with your smartphone keyboard:\n", + "\n", + "
![Smartphone keyboard](images/phone_keyboard.png)
\n", + "
![Smartphone_autocomplete](images/phone_autocomplete.gif)
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## So, how does it work? ##\n", + "\n", + "First, we need a **corpus**, or the text our generator will recombine into new sentences:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "corpus = 'The quick brown fox jumps over the lazy dog'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Simplest word **tokenization** is to split on spaces:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "words = corpus.split(' ')\n", + "words" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "To create **bigrams**, iterate through the list of words with two indicies, one of which is offset by one:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('The', 'quick'),\n", + " ('quick', 'brown'),\n", + " ('brown', 'fox'),\n", + " ('fox', 'jumps'),\n", + " ('jumps', 'over'),\n", + " ('over', 'the'),\n", + " ('the', 'lazy'),\n", + " ('lazy', 'dog')]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigrams = [b for b in zip(words[:-1], words[1:])]\n", + "bigrams" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "How do we use the bigrams to predict the next word given the first word?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + " Return every second element where the first element matches the **condition**:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['quick', 'lazy']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "condition = 'the'\n", + "next_words = [bigram[1] for bigram in bigrams\n", + " if bigram[0].lower() == condition]\n", + "next_words" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "(The quick) (quick brown) ... (the lazy) (lazy dog)\n", + "\n", + "Either “quick” or “lazy” could be the next word." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Trigrams and Ngrams ##\n", + "\n", + "We can partition by threes too:\n", + "\n", + "(The quick brown) (quick brown fox) ... (the lazy dog)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "Or, the condition can be two words (`condition = 'the lazy'`):\n", + "\n", + "(The quick brown) (quick brown fox) ... 
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "Or, the condition can be two words (`condition = 'the lazy'`):\n",
+    "\n",
+    "(The quick brown) (quick brown fox) ... (the lazy dog)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "These are **trigrams**."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "We can partition any **N** number of words together as **ngrams**."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "So earlier we got:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['quick', 'lazy']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "next_words"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "How do we know which one to pick as the next word?\n",
+    "\n",
+    "Why not the word that occurred the most often after the condition in the corpus?\n",
+    "\n",
+    "We can use a **Conditional Frequency Distribution (CFD)** to figure that out!\n",
+    "\n",
+    "A **CFD** can tell us: given a **condition**, what is **likely** to follow?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "## Conditional Frequency Distributions (CFDs) ##"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'and', 'the', 'quick', 'cat']\n"
+     ]
+    }
+   ],
+   "source": [
+    "words = 'The quick brown fox jumped over the lazy dog and the quick cat'.split(' ')\n",
+    "print words"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true,
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from collections import defaultdict\n",
+    "\n",
+    "cfd = defaultdict(lambda: defaultdict(lambda: 0))\n",
+    "condition = 'the'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'the': {'lazy': 1, 'quick': 2}}"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# count every word that follows the condition\n",
+    "for i in range(len(words) - 1):\n",
+    "    if words[i].lower() == condition:\n",
+    "        cfd[condition][words[i+1]] += 1\n",
+    "\n",
+    "# pretty print the defaultdict\n",
+    "{k: dict(v) for k, v in dict(cfd).items()}"
+   ]
+  },
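+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "Nothing stops us from counting every condition at once. A sketch (not run here) that tallies all bigrams into the same nested-dict shape:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# one nested dict of counts, keyed by every lowercased first word\n",
+    "cfd_all = defaultdict(lambda: defaultdict(lambda: 0))\n",
+    "for first, second in zip(words[:-1], words[1:]):\n",
+    "    cfd_all[first.lower()][second] += 1\n",
+    "\n",
+    "{k: dict(v) for k, v in cfd_all.items()}"
+   ]
+  },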
##" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'quick'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max(cfd[condition])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Whole sentences can be the conditions and values too ##\n", + "\n", + "Which is basically the way cleverbot works:\n", + "\n", + "![Cleverbot](images/cleverbot.png)\n", + "\n", + "[http://www.cleverbot.com/](http://www.cleverbot.com/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Random text! ##" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "must therefore that half ago for hope that occasion , Perry -- abundance about ten\n" + ] + } + ], + "source": [ + "import nltk\n", + "import random\n", + "\n", + "TEXT = nltk.corpus.gutenberg.words('austen-emma.txt')\n", + "\n", + "# NLTK shortcuts :)\n", + "bigrams = nltk.bigrams(TEXT)\n", + "cfd = nltk.ConditionalFreqDist(bigrams)\n", + "\n", + "# pick a random word from the corpus to start with\n", + "word = random.choice(TEXT)\n", + "# generate 15 more words\n", + "for i in range(15):\n", + " print word,\n", + " if word in cfd:\n", + " word = random.choice(cfd[word].keys())\n", + " else:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Random poems ##\n", + "\n", + "Generating random poems is simply limiting the choice of the next word by some constraint:\n", + "\n", + "* words that rhyme with the previous line\n", + "* words that match a certain syllable count\n", + "* words that alliterate with words on the same line\n", + "* etc." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "![Buzzfeed Haiku Generator](images/buzzfeed.png)\n", + "\n", + "[http://mule.hallada.net/nlp/buzzfeed-haiku-generator/](http://mule.hallada.net/nlp/buzzfeed-haiku-generator/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Remember these? ##\n", + "\n", + "![madlibs](images/madlibs.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "\n", + "These worked so well because they forced the random words (chosed by you) to fit into the syntactical structure and parts-of-speech of an existing sentence.\n", + "\n", + "You end up with **syntactically** correct sentences that are **semantically** random.\n", + "\n", + "We can do the same thing!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## NLTK Syntax Trees! 
##" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(S\n", + " (NP (DT the) (NN quick))\n", + " (VP\n", + " (VB brown)\n", + " (NP\n", + " (NP (JJ fox) (NN jumps))\n", + " (PP (IN over) (NP (DT the) (JJ lazy) (NN dog)))))\n", + " (. .))\n" + ] + } + ], + "source": [ + "from stat_parser import Parser\n", + "parser = Parser()\n", + "print parser.parse('The quick brown fox jumps over the lazy dog.')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Swaping matching syntax subtrees between two corpora ##" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(SBARQ\n", + " (SQ\n", + " (NP (PRP she))\n", + " (VP\n", + " (VBD was)\n", + " (VBN obliged)\n", + " (S+VP (TO to) (VP (VB stop) (CC and) (VB think)))))\n", + " (. .))\n", + "she was obliged to stop and think .\n", + "==============================\n", + "They was hacked to amp ; support !\n", + "(SBARQ\n", + " (SQ\n", + " (NP (PRP They))\n", + " (VP\n", + " (VBD was)\n", + " (VBN hacked)\n", + " (S+VP (TO to) (VP (VB amp) (CC ;) (VB support)))))\n", + " (. !))\n" + ] + } + ], + "source": [ + "from syntax_aware_generate import generate\n", + "\n", + "# inserts matching syntax subtrees from trump.txt into\n", + "# trees from austen-emma.txt\n", + "generate('trump.txt', word_limit=15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## spaCy ##\n", + "\n", + "![spaCy speed comparison](images/spacy_speed.png)\n", + "\n", + "[https://spacy.io/docs/api/#speed-comparison](https://spacy.io/docs/api/#speed-comparison)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Character-based Recurrent Neural Networks ##\n", + "\n", + "![RNN Paper](images/rnn_paper.png)\n", + "\n", + "[http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf](http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Implementation: char-rnn ##\n", + "\n", + "![char-rnn](images/char-rnn.png)\n", + "\n", + "[https://github.com/karpathy/char-rnn](https://github.com/karpathy/char-rnn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Generating Shakespeare with char-rnn ##\n", + "\n", + "![Shakespeare](images/shakespeare.png)\n", + "\n", + "[http://karpathy.github.io/2015/05/21/rnn-effectiveness/](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# The end #\n", + "\n", + "Questions?" 
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Slideshow",
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11+"
+  },
+  "livereveal": {
+   "scroll": true,
+   "theme": "simple",
+   "transition": "linear"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/generate_poem.py b/generate_poem.py
index 93689df..106cc27 100644
--- a/generate_poem.py
+++ b/generate_poem.py
@@ -13,7 +13,7 @@ from count_syllables import count_syllables
 
 
 class PoemGenerator():
-    def __init__(self, corpus):
+    def __init__(self):
         #self.corpus = 'melville-moby_dick.txt'
         #self.corpus = read_titles()
         #self.sents = corpus.sents(self.corpus)
@@ -71,7 +71,7 @@ class PoemGenerator():
         else:
             print('')
 
-    def generate_poem(self):
+    def generate_text(self):
         #sent = random.choice(self.sents)
         #parsed = self.parser.parse(' '.join(sent))
         word = random.choice(self.bigrams)[0]
@@ -139,7 +139,7 @@
 
 
 if __name__ == '__main__':
-    generator = PoemGenerator('poop')
+    generator = PoemGenerator()
     #generator.generate_poem()
     haiku = generator.generate_haiku()
     print haiku
diff --git a/images/buzzfeed.png b/images/buzzfeed.png
new file mode 100644
index 0000000..c31f115
Binary files /dev/null and b/images/buzzfeed.png differ
diff --git a/images/char-rnn.png b/images/char-rnn.png
new file mode 100644
index 0000000..847bf46
Binary files /dev/null and b/images/char-rnn.png differ
diff --git a/images/cleverbot.png b/images/cleverbot.png
new file mode 100644
index 0000000..8f6ba5c
Binary files /dev/null and b/images/cleverbot.png differ
diff --git a/images/madlibs.png b/images/madlibs.png
new file mode 100644
index 0000000..6a4569f
Binary files /dev/null and b/images/madlibs.png differ
diff --git a/images/phone_autocomplete.gif b/images/phone_autocomplete.gif
new file mode 100644
index 0000000..8f45c2b
Binary files /dev/null and b/images/phone_autocomplete.gif differ
diff --git a/images/phone_keyboard.png b/images/phone_keyboard.png
new file mode 100644
index 0000000..51e74ce
Binary files /dev/null and b/images/phone_keyboard.png differ
diff --git a/images/rnn_paper.png b/images/rnn_paper.png
new file mode 100644
index 0000000..808bc94
Binary files /dev/null and b/images/rnn_paper.png differ
diff --git a/images/shakespeare.png b/images/shakespeare.png
new file mode 100644
index 0000000..fd1b601
Binary files /dev/null and b/images/shakespeare.png differ
diff --git a/images/spacy_speed.png b/images/spacy_speed.png
new file mode 100644
index 0000000..c6b9d38
Binary files /dev/null and b/images/spacy_speed.png differ
diff --git a/syntax_aware_generate.py b/syntax_aware_generate.py
index 7958f0d..4094d36 100644
--- a/syntax_aware_generate.py
+++ b/syntax_aware_generate.py
@@ -29,7 +29,7 @@ Tree.__hash__ = tree_hash
 # corpora. Shitty bus wifi makes it hard to download spacy data and look up the
 # docs.
 
-def generate(filename):
+def generate(filename, word_limit=None):
     global syntaxes
     parser = Parser()
     if not os.path.exists(SYNTAXES_FILE):
@@ -37,7 +37,10 @@
         # NOTE: results.txt is a big file of raw text not included in source control, provide your own corpus.
         with codecs.open(filename, encoding='utf-8') as corpus:
             sents = nltk.sent_tokenize(corpus.read())
-            sents = [sent for sent in sents if len(sent) < 150][0:1500]
+            if word_limit:
+                sents = [sent for sent in sents if len(sent) < word_limit]
+            sent_limit = min(1500, len(sents))
+            sents = sents[0:sent_limit]
             for sent in tqdm(sents):
                 try:
                     parsed = parser.parse(sent)
@@ -60,7 +63,8 @@
         cfds = pickle.load(pickle_file)
 
     sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
-    sents = [sent for sent in sents if len(sent) < 50]
+    if word_limit:
+        sents = [sent for sent in sents if len(sent) < word_limit]
    sent = random.choice(sents)
    parsed = parser.parse(' '.join(sent))
    print(parsed)