diff --git a/edX Lightning Talk.ipynb b/edX Lightning Talk.ipynb index ed4afb9..03832e3 100644 --- a/edX Lightning Talk.ipynb +++ b/edX Lightning Talk.ipynb @@ -296,8 +296,17 @@ "source": [ "How do we know which one to pick as the next word?\n", "\n", - "Why not the word that occurred the most often after the condition in the corpus?\n", - "\n", + "Why not the word that occurred the most often after the condition in the corpus?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ "We can use a **Conditional Frequency Distribution (CFD)** to figure that out!\n", "\n", "A **CFD** can tell us: given a **condition**, what is **likely** to follow?" @@ -332,7 +341,8 @@ } ], "source": [ - "words = 'The quick brown fox jumped over the lazy dog and the quick cat'.split(' ')\n", + "words = ('The quick brown fox jumped over the '\n", + " 'lazy dog and the quick cat').split(' ')\n", "print words" ] }, @@ -349,8 +359,18 @@ "source": [ "from collections import defaultdict\n", "\n", - "cfd = defaultdict(lambda: defaultdict(lambda: 0))\n", - "condition = 'the'" + "cfd = defaultdict(lambda: defaultdict(lambda: 0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Conditional Frequency Distributions (CFDs) ##" ] }, { @@ -365,7 +385,15 @@ { "data": { "text/plain": [ - "{'the': {'lazy': 1, 'quick': 2}}" + "{'and': {'the': 1},\n", + " 'brown': {'fox': 1},\n", + " 'dog': {'and': 1},\n", + " 'fox': {'jumped': 1},\n", + " 'jumped': {'over': 1},\n", + " 'lazy': {'dog': 1},\n", + " 'over': {'the': 1},\n", + " 'quick': {'brown': 1, 'cat': 1},\n", + " 'the': {'lazy': 1, 'quick': 2}}" ] }, "execution_count": 8, @@ -374,11 +402,10 @@ } ], "source": [ - "for i in range(len(words) - 2):\n", - " if words[i].lower() == condition:\n", - " cfd[condition][words[i+1]] += 1\n", + "for i in range(len(words) - 1): # loop to the next-to-last word\n", + " 
cfd[words[i].lower()][words[i+1].lower()] += 1\n", "\n", - "# pretty print the defaultdict \n", + "# pretty print the defaultdict\n", "{k: dict(v) for k, v in dict(cfd).items()}" ] }, @@ -386,11 +413,11 @@ "cell_type": "markdown", "metadata": { "slideshow": { - "slide_type": "slide" + "slide_type": "fragment" } }, "source": [ - "## What's the most likely? ##" + "So, what's the most likely word to follow `'the'`?" ] }, { @@ -414,7 +441,7 @@ } ], "source": [ - "max(cfd[condition])" + "max(cfd['the'], key=cfd['the'].get) # pick by count, not alphabetically" ] }, { @@ -447,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 31, "metadata": { "slideshow": { "slide_type": "fragment" @@ -458,7 +485,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "must therefore that half ago for hope that occasion , Perry -- abundance about ten\n" + "her reserve and concealment towards some feelings in moving slowly together . You will shew\n" ] } ], @@ -599,7 +626,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "metadata": { "slideshow": { "slide_type": "fragment" @@ -612,22 +639,16 @@ "text": [ "(SBARQ\n", " (SQ\n", - " (NP (PRP she))\n", - " (VP\n", - " (VBD was)\n", - " (VBN obliged)\n", - " (S+VP (TO to) (VP (VB stop) (CC and) (VB think)))))\n", + " (NP (PRP I))\n", + " (VP (VBP do) (RB not) (VB advise) (NP (DT the) (NN custard))))\n", " (. .))\n", - "she was obliged to stop and think .\n", + "I do not advise the custard .\n", "==============================\n", - "They was hacked to amp ; support !\n", + "I do n't want the drone !\n", "(SBARQ\n", " (SQ\n", - " (NP (PRP They))\n", - " (VP\n", - " (VBD was)\n", - " (VBN hacked)\n", - " (S+VP (TO to) (VP (VB amp) (CC ;) (VB support)))))\n", + " (NP (PRP I))\n", + " (VP (VBP do) (RB n't) (VB want) (NP (DT the) (NN drone))))\n", " (. 
!))\n" ] } @@ -637,7 +658,7 @@ "\n", "# inserts matching syntax subtrees from trump.txt into\n", "# trees from austen-emma.txt\n", - "generate('trump.txt', word_limit=15)" + "generate('trump.txt', word_limit=10)" ] }, {