Commit 6b36d160 authored by Loïc Barrault

Added lab shannon

parent 3e6380c7
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"id": "fifty-representation",
"metadata": {},
"source": [
"# This lab consists of implementing the basics of Language Modeling and recreating Shannon and Weaver's experiment on generating character and word sequences."
]
},
{
"cell_type": "code",
"execution_count": 281,
"id": "minor-acting",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from collections import Counter\n",
"\n",
"np.random.seed()\n",
"\n",
"BOS='<s>'\n",
"EOS='</s>'\n",
"UNK='<unk>'\n"
]
},
{
"cell_type": "markdown",
"id": "eight-magnitude",
"metadata": {},
"source": [
"# Character models"
]
},
{
"cell_type": "code",
"execution_count": 170,
"id": "speaking-impact",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocab:abcdefghijklmnopqrstuvwxyz [27]\n"
]
}
],
"source": [
"vocab=\"abcdefghijklmnopqrstuvwxyz \"\n",
"vocab_array = list(vocab)\n",
"vocab_len = len(vocab)\n",
"print(\"Vocab:{} [{}]\".format(vocab, vocab_len))"
]
},
{
"cell_type": "markdown",
"id": "understanding-monthly",
"metadata": {},
"source": [
"## Uniform distribution"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "essential-income",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tlmyyqtpyswslcpjecwfptkjkf lmhmsppkctqhwjlonwfjwqnfrgwjeviahfylccgiergnuchuwiceijmybytvuzfzlxnlqimad"
]
}
],
"source": [
"txt_len = 100\n",
"\n",
"probabilities = [1/vocab_len]*vocab_len\n",
"s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)\n",
"\n",
"\n",
"#s = np.random.randint(0,vocab_len,txt_len)\n",
"for i in s:\n",
" print(\"{}\".format(i), end='')"
]
},
{
"cell_type": "markdown",
"id": "operational-beginning",
"metadata": {},
"source": [
"## Following letter frequency in English\n",
"See e.g. https://en.wikipedia.org/wiki/Letter_frequency and http://norvig.com/mayzner.html and http://www.fitaly.com/board/domper3/posts/136.html"
]
},
{
"cell_type": "code",
"execution_count": 219,
"id": "residential-marketing",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"toi pupaeogtl esietc t eofuhnaotnlh ancknfp v daeftdaocrddlnyocwnno aetagrarha ethpsphopo nlsitkaht"
]
}
],
"source": [
"probabilities = [0.075, 0.01, 0.02, 0.04, 0.11, 0.02, 0.01, \n",
" 0.055, 0.06, 0.001, 0.007, 0.03, 0.02, 0.06, \n",
" 0.065, 0.014, 0.00095, 0.055, 0.06, 0.09, 0.02, \n",
" 0.008, 0.02, 0.001, 0.002, 0.0007, 0.14534999999999984]\n",
"\n",
"s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)\n",
"for i in s:\n",
" print(\"{}\".format(i), end='')\n"
]
},
{
"cell_type": "markdown",
"id": "controlling-tuesday",
"metadata": {},
"source": [
"## Using a corpus - Sherlock Holmes novel"
]
},
{
"cell_type": "code",
"execution_count": 276,
"id": "assisted-burden",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COUNTS: Counter({' ': 95681, 'e': 53169, 't': 39034, 'a': 35159, 'o': 33536, 'i': 30156, 'h': 29077, 'n': 28682, 's': 27192, 'r': 24547, 'd': 18540, 'l': 17166, 'u': 13099, '\\n': 11944, 'm': 11798, 'w': 11274, 'c': 10522, 'y': 9445, 'f': 8986, 'g': 7906, ',': 7662, 'p': 6806, 'b': 6378, '.': 6211, 'v': 4455, 'k': 3551, '“': 2764, '”': 2325, '’': 1051, '-': 741, '?': 738, 'x': 549, 'j': 458, '‘': 434, 'q': 427, '!': 346, ';': 202, '—': 191, 'z': 150, '_': 142, '0': 86, '1': 65, ':': 62, '2': 39, '8': 38, '£': 37, '4': 22, '7': 18, '6': 16, '5': 16, '9': 15, '3': 15, 'é': 12, '&': 7, '*': 6, 'æ': 6, '(': 5, ')': 5, 'œ': 2, \"'\": 1, '[': 1, '#': 1, ']': 1, '½': 1, 'à': 1, 'â': 1, 'è': 1}) [67]\n",
"VOCABULARY: ['d', ',', '—', 'f', '4', 'u', 'w', '“', 'o', '!', 's', '_', '‘', 'e', 'k', '?', '”', '3', '9', 'x', 'i', 'y', 't', '2', '-', '8', '’', '1', '6', 'm', 'h', 'p', 'r', '0', '.', ':', 'b', 'c', '£', 'g', 'é', 'j', 'n', 'v', '\\n', 'q', ';', 'l', '5', '7', 'z', ' ', 'a'] [53]\n",
"PROBABILITIES: [0.03293453062964641, 0.013610807642090116, 0.0003392931688383206, 0.015962766571629053, 3.908088855729347e-05, 0.0232691163278176, 0.02002717898158757, 0.004909980726016325, 0.0595734853935179, 0.0006146357927647064, 0.04830397825681473, 0.00025224937159707604, 0.0007709593469938803, 0.09444962562285165, 0.006308010693952233, 0.0013109861706946627, 0.004130139358895787, 2.664606037997282e-05, 2.664606037997282e-05, 0.0009752458099070053, 0.053569239787897356, 0.01677813601925622, 0.0693401547247906, 6.927975698792933e-05, 0.0013163153827706574, 6.750335296259781e-05, 0.001867000630623429, 0.00011546626164654889, 2.8422464405304343e-05, 0.02095801469086129, 0.051652499844564645, 0.012090205796406335, 0.043605389609812854, 0.00015277074617851084, 0.01103324540133408, 0.00011013704957055433, 0.011329904873564443, 0.018691323154538267, 6.572694893726629e-05, 0.014044250224271007, 2.1316848303978258e-05, 0.0008135930436018368, 0.050950820254558694, 0.007913879932851928, 0.02121736967855969, 0.0007585245188165596, 0.0003588336131169673, 0.030493751498840895, 2.8422464405304343e-05, 3.1975272455967385e-05, 0.0002664606037997282, 0.1699681135477453, 0.06245658912663096]\n",
"’ya gfy,ai?e eeprhlo tiek f“ rl \n",
" r reoenn ’ eayeeac aa otpwrrmc etlcra heprlab sfyianntrsu.ounhimh"
]
}
],
"source": [
"# Read the Sherlock Holmes novel\n",
"filename='TheAdventuresOfSherlockHolmes.txt'\n",
"with open(filename, 'rt') as fd:\n",
" text = list(fd.read().lower())\n",
"\n",
"# Get character counts and frequencies\n",
"counts=Counter(text)\n",
"print(\"COUNTS:\", counts, '[{}]'.format(len(counts)))\n",
"\n",
"# Create vocabulary from the characters occurring in the text\n",
"#vocabulary = set(text_array) # Use all characters in vocabulary\n",
"vocabulary = [ x for x in set(text) if counts[x] > 10 ] # Use characters appearing more than N times\n",
"print(\"VOCABULARY:\", vocabulary, '[{}]'.format(len(vocabulary)))\n",
"\n",
"# Get probabilities by relative frequency\n",
"# Get the denominator\n",
"sum_counts = sum([c for e,c in counts.items() if e in vocabulary])\n",
"# Compute the probabilities\n",
"probabilities = [counts[e]/sum_counts for e in vocabulary]\n",
"print(\"PROBABILITIES:\", probabilities)\n",
"\n",
"# Sampling from the distribution\n",
"s = np.random.choice(vocabulary, size=txt_len, replace=True, p=probabilities)\n",
"for i in s:\n",
" print(\"{}\".format(i), end='')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "worth-quarterly",
"metadata": {},
"outputs": [],
"source": [
"# Get the counts for a certain order\n",
"# Smoothing: value between 0 and 1 to smooth the model (add-k)\n",
"def count_order(text, vocabulary, order=1, smoothing=0):\n",
"    assert 0 <= smoothing <= 1\n",
"    # Get unigram and bigram counts over the token sequence\n",
"    # (vocabulary, order and smoothing are not used yet here;\n",
"    # only unigram and bigram counts are computed)\n",
"    unigram_counts = {}\n",
"    bigram_counts = {}\n",
"    prevw = None\n",
"    for w in text:\n",
"        if w in unigram_counts:\n",
"            unigram_counts[w] += 1\n",
"        else:\n",
"            unigram_counts[w] = 1\n",
"        if prevw is not None:\n",
"            if prevw not in bigram_counts:\n",
"                bigram_counts[prevw] = {}\n",
"            if w in bigram_counts[prevw]:\n",
"                bigram_counts[prevw][w] += 1\n",
"            else:\n",
"                bigram_counts[prevw][w] = 1\n",
"        # Reset the history at sentence boundaries\n",
"        if w == EOS:\n",
"            prevw = None\n",
"        else:\n",
"            prevw = w\n",
"    return unigram_counts, bigram_counts"
]
},
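{
"cell_type": "markdown",
"id": "usage-sketch-note",
"metadata": {},
"source": [
"A minimal usage sketch, assuming the character `text` and `vocabulary` from the Sherlock Holmes cell above are still in scope: call `count_order` and inspect a few of the returned counts. (The cell ids and variable names below are illustrative assumptions, not part of the original lab.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "usage-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical usage of count_order (an illustrative assumption, not a cell from the lab):\n",
"# count unigrams and bigrams over the character list read from the Sherlock Holmes text.\n",
"char_unigrams, char_bigrams = count_order(text, vocabulary)\n",
"print(sorted(char_unigrams.items(), key=lambda kv: -kv[1])[:5])\n",
"print(list(char_bigrams.get('t', {}).items())[:5])"
]
},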
{
"cell_type": "markdown",
"id": "stuck-cement",
"metadata": {},
"source": [
"# Word level bigram model"
]
},
{
"cell_type": "code",
"execution_count": 304,
"id": "continuous-flesh",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"text sent: [\"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\\n\\nThis eBook is for the use of anyone anywhere at no cost and with\\nalmost no restrictions whatsoever.\", 'You may copy it, give it away or\\nre-use it under the terms of the Project Gutenberg License included\\nwith this eBook or online at www.gutenberg.org\\n\\n\\nTitle: The Adventures of Sherlock Holmes\\n\\nAuthor: Arthur Conan Doyle\\n\\nRelease Date: November 29, 2002 [EBook #1661]\\nLast Updated: May 20, 2019\\n\\nLanguage: English\\n\\nCharacter set encoding: UTF-8\\n\\n*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***\\n\\n\\n\\nProduced by an anonymous Project Gutenberg volunteer and Jose Menendez\\n\\n\\n\\ncover\\n\\n\\n\\nThe Adventures of Sherlock Holmes\\n\\n\\n\\nby Arthur Conan Doyle\\n\\n\\n\\nContents\\n\\n\\n I.', 'A Scandal in Bohemia\\n II.', 'The Red-Headed League\\n III.', 'A Case of Identity\\n IV.', 'The Boscombe Valley Mystery\\n V. The Five Orange Pips\\n VI.', 'The Man with the Twisted Lip\\n VII.', 'The Adventure of the Blue Carbuncle\\n VIII.', 'The Adventure of the Speckled Band\\n IX.', 'The Adventure of the Engineer’s Thumb\\n X.']\n",
"VOCAB: ['that', 'friends', 'someone', 'gun', 'same', 'terribly', 'circle', 'handed', 'shook', 'conclusions'] [2570]\n",
"TOTAL_WORDS: 134698\n",
"UNIGRAM_PROBS: [0.035026503734279645, 2.9696060817532554e-05, 2.9696060817532554e-05, 0.07469301697129875, 0.0025538612303077995, 2.9696060817532554e-05, 0.019502887941914505, 0.0007275534900295476, 0.003377926917994328, 0.056838260404757306]\n",
" ---- UNIFORM </s>\n"
]
}
],
"source": [
"# Read the Sherlock Holmes novel\n",
"filename='TheAdventuresOfSherlockHolmes.txt'\n",
"with open(filename, 'rt') as fd:\n",
" #text = fd.read().lower()\n",
" text = fd.read()\n",
"\n",
"# Tokenize sentences and words\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.tokenize import sent_tokenize\n",
"text_sent = sent_tokenize(text)\n",
"print(\"text sent:\", text_sent[:10])\n",
"\n",
"# Prepend BOS and append EOS\n",
"text_tok = []\n",
"for s in text_sent:\n",
" words = word_tokenize(s)\n",
" words.insert(0, BOS)\n",
" words.append(EOS)\n",
" text_tok.append(words)\n",
"\n",
"text_tok = [word for sentence in text_tok for word in sentence]\n",
"#print(\"text tok:\", text_tok[:100])\n",
"\n",
"# Get word counts to build the vocabulary\n",
"counts=Counter(text_tok)\n",
"#print(\"COUNTS:\", counts, '[{}]'.format(len(counts)))\n",
"\n",
"#vocabulary = list(set(text_tok)) # Use all words\n",
"vocabulary = [ x for x in set(text_tok) if counts[x] > 3 ] # Use words appearing more than N times\n",
"print(\"VOCAB:\", vocabulary[:10], '[{}]'.format(len(vocabulary)))\n",
"index2vocab = {w:i for (i,w) in enumerate(vocabulary)}\n",
"\n",
"\n",
"# Update the text, map unknown words to unk\n",
"text_tok = [word if word in vocabulary else UNK for word in text_tok]\n",
"#print(\"text tok:\", text_tok[:100])\n",
"\n",
"# Get word counts and frequencies\n",
"unigram_counts = {}\n",
"bigram_counts = {}\n",
"prevw = None\n",
"for w in text_tok:\n",
"    if w in unigram_counts:\n",
"        unigram_counts[w] += 1\n",
"    else:\n",
"        unigram_counts[w] = 1\n",
"    if prevw is not None:\n",
"        if prevw not in bigram_counts:\n",
"            bigram_counts[prevw] = {}\n",
"        if w in bigram_counts[prevw]:\n",
"            bigram_counts[prevw][w] += 1\n",
"        else:\n",
"            bigram_counts[prevw][w] = 1\n",
"    if w == EOS:\n",
"        prevw = None\n",
"    else:\n",
"        prevw = w\n",
" \n",
"total_words = np.sum([unigram_counts[k] for k in unigram_counts.keys()])\n",
"print(\"TOTAL_WORDS:\", total_words) \n",
"unigram_probs = [unigram_counts[w]/total_words for w in vocabulary] # indexed consistently with index2vocab\n",
"print(\"UNIGRAM_PROBS:\", unigram_probs[:10]) \n",
"bigram_probs = {} \n",
"\n",
"# Get probabilities by relative frequency\n",
"# p(w2|w1) = C(w1 w2)/C(w1)\n",
"for w1 in vocabulary:\n",
"    if w1 in bigram_counts:\n",
"        bigram_probs[w1] = []\n",
"        for w2 in vocabulary:\n",
"            if w2 in bigram_counts[w1]:\n",
"                bigram_probs[w1].append(bigram_counts[w1][w2]/unigram_counts[w1])\n",
"            else: # never seen w1 w2 in the corpus -> use the unigram prob for now\n",
"                bigram_probs[w1].append(unigram_probs[index2vocab[w2]])\n",
"    else: # w1 never started a bigram, should be the case for EOS, use uniform distributions\n",
"        bigram_probs[w1] = [1/len(vocabulary)]*len(vocabulary)\n",
"        print(' ---- UNIFORM {}'.format(w1))\n",
"    # let's renormalize everything here\n",
"    bigram_probs[w1] = bigram_probs[w1]/np.sum(bigram_probs[w1])\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "characteristic-lighting",
"metadata": {},
"source": [
"# Sample the bigram model"
]
},
{
"cell_type": "code",
"execution_count": 290,
"id": "generous-finance",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MY STORY: holmes \n",
"walked slowly , servants gun see. ” “ ‘ jones better proceed to remember were point , corner of it is passion also , surprised received hudson has motive bundle in coming from stepped up from unable to sitting by the passed imagine. inquest ones dozen forced to cry envelope gun same monday , gun sleeves thinking palm of a door ones empty fourteen ones violet conclusions link passed searched sound corner liberty . deeper still between them season bachelor , gun as it to answer same out of to-day had lens , mr. wilson chill to smoke same by "
]
}
],
"source": [
"# Sample from unigram probability\n",
"#prevw = np.random.choice(vocabulary, replace=True, p=unigram_probs)\n",
"prevw = 'holmes'\n",
"print(\"MY STORY: \", prevw, ' ')\n",
"\n",
"for i in range(txt_len):\n",
" #print(\"SUM: \", np.sum(bigram_probs[prevw]))\n",
" prevw = np.random.choice(vocabulary, replace=True, p=bigram_probs[prevw])\n",
" print(\"{}\".format(prevw), end=' ')"
]
},
{
"cell_type": "code",
"execution_count": 306,
"id": "contemporary-entrance",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2.22734512e-05 3.34101767e-05 4.89826828e-01 1.48489674e-05] [2570]\n"
]
}
],
"source": [
"w='Sherlock'\n",
"i=index2vocab['Holmes']\n",
"print(bigram_probs[w][i-2:i+2], \"[{}]\".format(len(bigram_probs[w])))"
]
},
{
"cell_type": "code",
"execution_count": 307,
"id": "spectacular-boring",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00011353731279482134\n",
"0.013061533049175319\n",
"0.05008123914797422\n",
"7.426904349923723e-08\n"
]
}
],
"source": [
"s='my dear Watson'.split()\n",
"prevw=BOS\n",
"prob=1.0\n",
"for w in s:\n",
" p = bigram_probs[prevw][index2vocab[w]]\n",
" print(p)\n",
" prob *= p\n",
" prevw = w\n",
"print(prob)"
]
},
{
"cell_type": "code",
"execution_count": 308,
"id": "relative-vehicle",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00011353731279482134\n",
"0.013061533049175319\n",
"0.04006499131837938\n",
"5.941523479938979e-08\n"
]
}
],
"source": [
"s='my dear Holmes'.split()\n",
"prevw=BOS\n",
"prob=1.0\n",
"for w in s:\n",
" p = bigram_probs[prevw][index2vocab[w]]\n",
" print(p)\n",
" prob *= p\n",
" prevw = w\n",
"print(prob)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "naked-organ",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:markdown id:fifty-representation tags:
# This lab consists of implementing the basics of Language Modeling and recreating Shannon and Weaver's experiment on generating character and word sequences.
%% Cell type:code id:minor-acting tags:
``` python
import numpy as np
from collections import Counter
np.random.seed()
BOS='<s>'
EOS='</s>'
UNK='<unk>'
```
%% Cell type:markdown id:eight-magnitude tags:
# Character models
%% Cell type:code id:speaking-impact tags:
``` python
vocab="abcdefghijklmnopqrstuvwxyz "
vocab_array = list(vocab)
vocab_len = len(vocab)
print("Vocab:{} [{}]".format(vocab, vocab_len))
```
%%%% Output: stream
Vocab:abcdefghijklmnopqrstuvwxyz [27]
%% Cell type:markdown id:understanding-monthly tags:
## Uniform distribution
%% Cell type:code id:essential-income tags:
``` python
txt_len = 100
probabilities = [1/vocab_len]*vocab_len
s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)
#s = np.random.randint(0,vocab_len,txt_len)
for i in s:
print("{}".format(i), end='')
```
%%%% Output: stream
tlmyyqtpyswslcpjecwfptkjkf lmhmsppkctqhwjlonwfjwqnfrgwjeviahfylccgiergnuchuwiceijmybytvuzfzlxnlqimad
%% Cell type:markdown id:operational-beginning tags:
## Following letter frequency in English
See e.g. https://en.wikipedia.org/wiki/Letter_frequency and http://norvig.com/mayzner.html and http://www.fitaly.com/board/domper3/posts/136.html
%% Cell type:code id:residential-marketing tags:
``` python
probabilities = [0.075, 0.01, 0.02, 0.04, 0.11, 0.02, 0.01,
0.055, 0.06, 0.001, 0.007, 0.03, 0.02, 0.06,
0.065, 0.014, 0.00095, 0.055, 0.06, 0.09, 0.02,
0.008, 0.02, 0.001, 0.002, 0.0007, 0.14534999999999984]
s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)
for i in s:
print("{}".format(i), end='')
```
%%%% Output: stream
toi pupaeogtl esietc t eofuhnaotnlh ancknfp v daeftdaocrddlnyocwnno aetagrarha ethpsphopo nlsitkaht
%% Cell type:markdown id:controlling-tuesday tags:
## Using a corpus - Sherlock Holmes novel
%% Cell type:code id:assisted-burden tags:
``` python
# Read the Sherlock Holmes novel
filename='TheAdventuresOfSherlockHolmes.txt'
with open(filename, 'rt') as fd:
text = list(fd.read().lower())
# Get character counts and frequencies
counts=Counter(text)
print("COUNTS:", counts, '[{}]'.format(len(counts)))
# Create vocabulary from the characters occurring in the text
#vocabulary = set(text_array) # Use all characters in vocabulary
vocabulary = [ x for x in set(text) if counts[x] > 10 ] # Use characters appearing more than N times
print("VOCABULARY:", vocabulary, '[{}]'.format(len(vocabulary)))
# Get probabilities by relative frequency
# Get the denominator
sum_counts = sum([c for e,c in counts.items() if e in vocabulary])
# Compute the probabilities
probabilities = [counts[e]/sum_counts for e in vocabulary]
print("PROBABILITIES:", probabilities)
# Sampling from the distribution
s = np.random.choice(vocabulary, size=txt_len, replace=True, p=probabilities)
for i in s:
print("{}".format(i), end='')
```
%%%% Output: stream
COUNTS: Counter({' ': 95681, 'e': 53169, 't': 39034, 'a': 35159, 'o': 33536, 'i': 30156, 'h': 29077, 'n': 28682, 's': 27192, 'r': 24547, 'd': 18540, 'l': 17166, 'u': 13099, '\n': 11944, 'm': 11798, 'w': 11274, 'c': 10522, 'y': 9445, 'f': 8986, 'g': 7906, ',': 7662, 'p': 6806, 'b': 6378, '.': 6211, 'v': 4455, 'k': 3551, '“': 2764, '”': 2325, '’': 1051, '-': 741, '?': 738, 'x': 549, 'j': 458, '‘': 434, 'q': 427, '!': 346, ';': 202, '—': 191, 'z': 150, '_': 142, '0': 86, '1': 65, ':': 62, '2': 39, '8': 38, '£': 37, '4': 22, '7': 18, '6': 16, '5': 16, '9': 15, '3': 15, 'é': 12, '&': 7, '*': 6, 'æ': 6, '(': 5, ')': 5, 'œ': 2, "'": 1, '[': 1, '#': 1, ']': 1, '½': 1, 'à': 1, 'â': 1, 'è': 1}) [67]
VOCABULARY: ['d', ',', '—', 'f', '4', 'u', 'w', '“', 'o', '!', 's', '_', '‘', 'e', 'k', '?', '”', '3', '9', 'x', 'i', 'y', 't', '2', '-', '8', '’', '1', '6', 'm', 'h', 'p', 'r', '0', '.', ':', 'b', 'c', '£', 'g', 'é', 'j', 'n', 'v', '\n', 'q', ';', 'l', '5', '7', 'z', ' ', 'a'] [53]
PROBABILITIES: [0.03293453062964641, 0.013610807642090116, 0.0003392931688383206, 0.015962766571629053, 3.908088855729347e-05, 0.0232691163278176, 0.02002717898158757, 0.004909980726016325, 0.0595734853935179, 0.0006146357927647064, 0.04830397825681473, 0.00025224937159707604, 0.0007709593469938803, 0.09444962562285165, 0.006308010693952233, 0.0013109861706946627, 0.004130139358895787, 2.664606037997282e-05, 2.664606037997282e-05, 0.0009752458099070053, 0.053569239787897356, 0.01677813601925622, 0.0693401547247906, 6.927975698792933e-05, 0.0013163153827706574, 6.750335296259781e-05, 0.001867000630623429, 0.00011546626164654889, 2.8422464405304343e-05, 0.02095801469086129, 0.051652499844564645, 0.012090205796406335, 0.043605389609812854, 0.00015277074617851084, 0.01103324540133408, 0.00011013704957055433, 0.011329904873564443, 0.018691323154538267, 6.572694893726629e-05, 0.014044250224271007, 2.1316848303978258e-05, 0.0008135930436018368, 0.050950820254558694, 0.007913879932851928, 0.02121736967855969, 0.0007585245188165596, 0.0003588336131169673, 0.030493751498840895, 2.8422464405304343e-05, 3.1975272455967385e-05, 0.0002664606037997282, 0.1699681135477453, 0.06245658912663096]
’ya gfy,ai?e eeprhlo tiek f“ rl
r reoenn ’ eayeeac aa otpwrrmc etlcra heprlab sfyianntrsu.ounhimh
%% Cell type:code id:worth-quarterly tags:
``` python
# Get the counts for a certain order
# Smoothing: value between 0 and 1 to smooth the model (add-k)
def count_order(text, vocabulary, order=1, smoothing=0):
    assert 0 <= smoothing <= 1
    # Get unigram and bigram counts over the token sequence
    # (vocabulary, order and smoothing are not used yet here;
    # only unigram and bigram counts are computed)
    unigram_counts = {}
    bigram_counts = {}
    prevw = None
    for w in text:
        if w in unigram_counts:
            unigram_counts[w] += 1
        else:
            unigram_counts[w] = 1
        if prevw is not None:
            if prevw not in bigram_counts:
                bigram_counts[prevw] = {}
            if w in bigram_counts[prevw]:
                bigram_counts[prevw][w] += 1
            else:
                bigram_counts[prevw][w] = 1
        # Reset the history at sentence boundaries
        if w == EOS:
            prevw = None
        else:
            prevw = w
    return unigram_counts, bigram_counts
```
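%% Cell type:markdown id:usage-sketch-note tags:
A minimal usage sketch, assuming the character `text` and `vocabulary` from the Sherlock Holmes cell above are still in scope: call `count_order` and inspect a few of the returned counts. (The cell ids and variable names below are illustrative assumptions, not part of the original lab.)
%% Cell type:code id:usage-sketch-code tags:
``` python
# Hypothetical usage of count_order (an illustrative assumption, not a cell from the lab):
# count unigrams and bigrams over the character list read from the Sherlock Holmes text.
char_unigrams, char_bigrams = count_order(text, vocabulary)
print(sorted(char_unigrams.items(), key=lambda kv: -kv[1])[:5])
print(list(char_bigrams.get('t', {}).items())[:5])
```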
%% Cell type:markdown id:stuck-cement tags:
# Word level bigram model
%% Cell type:code id:continuous-flesh tags:
``` python
# Read the Sherlock Holmes novel
filename='TheAdventuresOfSherlockHolmes.txt'
with open(filename, 'rt') as fd:
#text = fd.read().lower()
text = fd.read()
# Tokenize sentences and words
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
text_sent = sent_tokenize(text)
print("text sent:", text_sent[:10])
# Prepend BOS and append EOS
text_tok = []
for s in text_sent:
words = word_tokenize(s)
words.insert(0, BOS)
words.append(EOS)
text_tok.append(words)
text_tok = [word for sentence in text_tok for word in sentence]
#print("text tok:", text_tok[:100])
# Get word counts to build the vocabulary
counts=Counter(text_tok)
#print("COUNTS:", counts, '[{}]'.format(len(counts)))
#vocabulary = list(set(text_tok)) # Use all words
vocabulary = [ x for x in set(text_tok) if counts[x] > 3 ] # Use words appearing more than N times