Commit 6b36d160 authored by Loïc Barrault

Added lab shannon

parent 3e6380c7
%% Cell type:markdown id:fifty-representation tags:
# This lab consists of implementing the basics of language modeling and recreating Shannon and Weaver's experiment on generating character and word sequences.
%% Cell type:code id:minor-acting tags:
``` python
import numpy as np
from collections import Counter
np.random.seed()  # Seed the RNG (pass an integer here for reproducible samples)
BOS='<s>'
EOS='</s>'
UNK='<unk>'
```
%% Cell type:markdown id:eight-magnitude tags:
# Character models
%% Cell type:code id:speaking-impact tags:
``` python
vocab="abcdefghijklmnopqrstuvwxyz "
vocab_array = list(vocab)
vocab_len = len(vocab)
print("Vocab:{} [{}]".format(vocab, vocab_len))
```
%% Output
Vocab:abcdefghijklmnopqrstuvwxyz [27]
%% Cell type:markdown id:understanding-monthly tags:
## Uniform distribution
%% Cell type:code id:essential-income tags:
``` python
txt_len = 100
probabilities = [1/vocab_len]*vocab_len
s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)
#s = np.random.randint(0,vocab_len,txt_len)
for i in s:
    print("{}".format(i), end='')
```
%% Output
tlmyyqtpyswslcpjecwfptkjkf lmhmsppkctqhwjlonwfjwqnfrgwjeviahfylccgiergnuchuwiceijmybytvuzfzlxnlqimad
%% Cell type:markdown id:operational-beginning tags:
## Following letter frequency in English
See e.g. https://en.wikipedia.org/wiki/Letter_frequency, http://norvig.com/mayzner.html, and http://www.fitaly.com/board/domper3/posts/136.html
%% Cell type:code id:residential-marketing tags:
``` python
# One probability per character of vocab (a-z, then space);
# the last entry (space) is set to the remainder so the distribution sums to 1
probabilities = [0.075, 0.01, 0.02, 0.04, 0.11, 0.02, 0.01,
                 0.055, 0.06, 0.001, 0.007, 0.03, 0.02, 0.06,
                 0.065, 0.014, 0.00095, 0.055, 0.06, 0.09, 0.02,
                 0.008, 0.02, 0.001, 0.002, 0.0007, 0.14534999999999984]
s = np.random.choice(vocab_array, size=txt_len, replace=True, p=probabilities)
for i in s:
    print("{}".format(i), end='')
```
%% Output
toi pupaeogtl esietc t eofuhnaotnlh ancknfp v daeftdaocrddlnyocwnno aetagrarha ethpsphopo nlsitkaht
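%% Cell type:markdown id:frequency-sanity tags:
`np.random.choice` requires `p` to sum to 1, which is why the last entry (the space character) was set to the remainder above. As a quick sanity check, the minimal sketch below verifies the sum and compares the per-character entropy of this distribution with the uniform one: the skewed distribution carries fewer bits per character, which is the point of Shannon's experiment.
%% Cell type:code id:frequency-sanity-code tags:
``` python
# Sanity check: the distribution must sum to 1 for np.random.choice
print("sum of probabilities: {}".format(sum(probabilities)))
# Per-character entropy in bits: H = -sum_i p_i * log2(p_i)
H_uniform = np.log2(vocab_len)
H_freq = -sum(p * np.log2(p) for p in probabilities)
print("uniform model:   {:.2f} bits/char".format(H_uniform))
print("frequency model: {:.2f} bits/char".format(H_freq))
```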
%% Cell type:markdown id:controlling-tuesday tags:
## Using a corpus - Sherlock Holmes novel
%% Cell type:code id:assisted-burden tags:
``` python
# Read the Sherlock Holmes novel
filename='TheAdventuresOfSherlockHolmes.txt'
with open(filename, 'rt') as fd:
    text = list(fd.read().lower())
# Get character counts and frequencies
counts=Counter(text)
print("COUNTS:", counts, '[{}]'.format(len(counts)))
# Create the vocabulary with characters occurring more than N times
#vocabulary = set(text) # Use all characters in vocabulary
vocabulary = [ x for x in set(text) if counts[x] > 10 ] # Use characters appearing more than N times
print("VOCABULARY:", vocabulary, '[{}]'.format(len(vocabulary)))
# Get probabilities by relative frequency
# Get the denominator
sum_counts = sum([c for e,c in counts.items() if e in vocabulary])
# Compute the probabilities
probabilities = [counts[e]/sum_counts for e in vocabulary]
print("PROBABILITIES:", probabilities)
# Sample from the distribution
s = np.random.choice(vocabulary, size=txt_len, replace=True, p=probabilities)
for i in s:
    print("{}".format(i), end='')
```
%% Output
COUNTS: Counter({' ': 95681, 'e': 53169, 't': 39034, 'a': 35159, 'o': 33536, 'i': 30156, 'h': 29077, 'n': 28682, 's': 27192, 'r': 24547, 'd': 18540, 'l': 17166, 'u': 13099, '\n': 11944, 'm': 11798, 'w': 11274, 'c': 10522, 'y': 9445, 'f': 8986, 'g': 7906, ',': 7662, 'p': 6806, 'b': 6378, '.': 6211, 'v': 4455, 'k': 3551, '“': 2764, '”': 2325, '’': 1051, '-': 741, '?': 738, 'x': 549, 'j': 458, '‘': 434, 'q': 427, '!': 346, ';': 202, '—': 191, 'z': 150, '_': 142, '0': 86, '1': 65, ':': 62, '2': 39, '8': 38, '£': 37, '4': 22, '7': 18, '6': 16, '5': 16, '9': 15, '3': 15, 'é': 12, '&': 7, '*': 6, 'æ': 6, '(': 5, ')': 5, 'œ': 2, "'": 1, '[': 1, '#': 1, ']': 1, '½': 1, 'à': 1, 'â': 1, 'è': 1}) [67]
VOCABULARY: ['d', ',', '—', 'f', '4', 'u', 'w', '“', 'o', '!', 's', '_', '‘', 'e', 'k', '?', '”', '3', '9', 'x', 'i', 'y', 't', '2', '-', '8', '’', '1', '6', 'm', 'h', 'p', 'r', '0', '.', ':', 'b', 'c', '£', 'g', 'é', 'j', 'n', 'v', '\n', 'q', ';', 'l', '5', '7', 'z', ' ', 'a'] [53]
PROBABILITIES: [0.03293453062964641, 0.013610807642090116, 0.0003392931688383206, 0.015962766571629053, 3.908088855729347e-05, 0.0232691163278176, 0.02002717898158757, 0.004909980726016325, 0.0595734853935179, 0.0006146357927647064, 0.04830397825681473, 0.00025224937159707604, 0.0007709593469938803, 0.09444962562285165, 0.006308010693952233, 0.0013109861706946627, 0.004130139358895787, 2.664606037997282e-05, 2.664606037997282e-05, 0.0009752458099070053, 0.053569239787897356, 0.01677813601925622, 0.0693401547247906, 6.927975698792933e-05, 0.0013163153827706574, 6.750335296259781e-05, 0.001867000630623429, 0.00011546626164654889, 2.8422464405304343e-05, 0.02095801469086129, 0.051652499844564645, 0.012090205796406335, 0.043605389609812854, 0.00015277074617851084, 0.01103324540133408, 0.00011013704957055433, 0.011329904873564443, 0.018691323154538267, 6.572694893726629e-05, 0.014044250224271007, 2.1316848303978258e-05, 0.0008135930436018368, 0.050950820254558694, 0.007913879932851928, 0.02121736967855969, 0.0007585245188165596, 0.0003588336131169673, 0.030493751498840895, 2.8422464405304343e-05, 3.1975272455967385e-05, 0.0002664606037997282, 0.1699681135477453, 0.06245658912663096]
’ya gfy,ai?e eeprhlo tiek f“ rl
r reoenn ’ eayeeac aa otpwrrmc etlcra heprlab sfyianntrsu.ounhimh
%% Cell type:code id:worth-quarterly tags:
``` python
# Get the unigram and bigram counts needed for a model of a certain order
# smoothing: add-k value between 0 and 1, applied later when estimating probabilities
def count_order(text, vocabulary, order=1, smoothing=0):
    assert 0 <= smoothing <= 1
    # Get counts
    unigram_counts = {}
    bigram_counts = {}
    prevw = None
    for w in text:
        unigram_counts[w] = unigram_counts.get(w, 0) + 1
        if order > 1 and prevw is not None:
            if prevw not in bigram_counts:
                bigram_counts[prevw] = {}
            bigram_counts[prevw][w] = bigram_counts[prevw].get(w, 0) + 1
        # Reset the context at end of sentence so bigrams do not cross boundaries
        prevw = None if w == EOS else w
    return unigram_counts, bigram_counts
```
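%% Cell type:markdown id:smoothed-probabilities tags:
With the counts in hand, bigram probabilities follow by relative frequency; the `smoothing` value implements add-k smoothing, which reserves some probability mass for unseen pairs. Below is a minimal sketch of the estimation step (the helper name `bigram_prob` is hypothetical, not part of the lab skeleton):
%% Cell type:code id:smoothed-probabilities-code tags:
``` python
# Add-k smoothed bigram probability (hypothetical helper):
# P(w | prev) = (count(prev, w) + k) / (count(prev) + k * |V|)
def bigram_prob(prevw, w, unigram_counts, bigram_counts, vocabulary, smoothing=0.0):
    num = bigram_counts.get(prevw, {}).get(w, 0) + smoothing
    den = unigram_counts.get(prevw, 0) + smoothing * len(vocabulary)
    return num / den if den > 0 else 0.0
```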
%% Cell type:markdown id:stuck-cement tags:
# Word-level bigram model
%% Cell type:code id:continuous-flesh tags:
``` python
# Read the Sherlock Holmes novel
filename='TheAdventuresOfSherlockHolmes.txt'
with open(filename, 'rt') as fd:
    #text = fd.read().lower()
    text = fd.read()
# Tokenize sentences and words
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
text_sent = sent_tokenize(text)
print("text sent:", text_sent[:10])
# Prepend BOS and append EOS to every sentence
text_tok = []
for s in text_sent:
    words = word_tokenize(s)
    words.insert(0, BOS)
    words.append(EOS)
    text_tok.append(words)
text_tok = [word for sentence in text_tok for word in sentence]
#print("text tok:", text_tok[:100])
# Get word counts and frequencies
counts=Counter(text_tok)
#print("COUNTS:", counts, '[{}]'.format(len(counts)))
#vocabulary = list(set(text_tok)) # Use all words
vocabulary = [ x for x in set(text_tok) if counts[x] > 3 ] # Use words appearing more than N times
print("VOCAB:", vocabulary[:10], '[{}]'.format(len(vocabulary)))
# Map each word to its index in the vocabulary
index2vocab = {w:i for (i,w) in enumerate(vocabulary)}
# Update the text, mapping unknown words to UNK
text_tok = [word if word in vocabulary else UNK for word in text_tok]
#print("text tok:", text_tok[:100])