%!TEX root = m2_language_model_sheffield.tex

%\section{Statistical Language Modelling}
%\subsection{Introduction}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}
%  \frametitle{Plan}
%
%\begin{block}{}
%    \begin{itemize}
%    \item n-gram language models
%    \item Non-parametric language models
%    \item Parametric language models
%    \end{itemize}
%\end{block}
%\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
  \frametitle{Natural Language Processing}

\vspace{\stretch{1}}

\begin{block}{}
In neuropsychology, linguistics, and the philosophy of language, a \textbf{natural language} or \textbf{ordinary language} is any language that has evolved naturally in humans through use and repetition without conscious planning or premeditation. Natural languages can take different forms, such as speech or signing. They are distinguished from constructed and formal languages such as those used to program computers or to study logic.

\null\hfill -- Wikipedia [\url{https://en.wikipedia.org/wiki/Natural_language}]

\end{block}
\vspace{\stretch{1}}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - What and why?}

\begin{block}{Aims of language models}
\begin{itemize}
\item Predict the future! ... words
\item Assign a probability to a sentence (or sequence of words)
\end{itemize}
\end{block}

\begin{block}{Many applications}
\begin{itemize}
\item Speech recognition
%\begin{itemize}
%	\item Lame aise on bleu
%	\item Là mais omble eux
%	\item La mais on bleue
%	\item La maison bleue \la\ This one is more probable!
%	\item La maison bleu
%\end{itemize}
\begin{itemize}
	\item I eight stake with whine
	\item I ate steak with whine
	\item I ate steak with wine \la\ This one is more probable!
\end{itemize}

\item Machine Translation: "La maison bleue"
\begin{itemize}
	\item "The house, blue" \la\ feels not natural (less probable)
	\item "The blue house" \la\ this one seems better! (more probable)
\end{itemize}
\end{itemize}
\end{block}

\end{frame}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - LM}
\begin{itemize}
\item Allows us to distinguish well-written sentences from bad ones
\item Should give priority to grammatically and semantically correct sentences
\begin{itemize}
\item in an implicit fashion: no need for syntactic or semantic analysis
\item Monolingual process \ra\ no notion of adequacy with the source sentence here
\end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - LM}
\begin{itemize}
\item Goal: provide a non-zero probability to {\bf all} sequences of words
\begin{itemize}
\item even for non-grammatical sentences
\item learned automatically from texts
\end{itemize}
\end{itemize}

\begin{block}{Issues:}
\begin{itemize}
\item How to assign a probability to a sequence of words?
\item How to deal with unseen words and sequences?
\item How to ensure good probability estimates?
\end{itemize}
\end{block}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model}
\begin{itemize}
\item Goal: provide a non-zero probability to all sequences of words $W$ over a {\bf vocabulary} $V$
\item Vocabulary: list of all words known by the model
\begin{itemize}
\item a special token {\bf \unk} to handle all the words not in $V$
\item word = sequence of characters without space
\item word $\ne$ linguistic word \ra\ token
\end{itemize}
\item[]
\item[] Let $ W = (w_1, w_2, \dots, w_n)$ with $w_i \in V$ be a word sequence
%\item[]
%\begin{center}
%$p(W) = \ds \prod_{i=1}^{T} p(w_i|h_i)$
%\end{center}
%\item[] with $h_i = (w_1, w_2, \dots, w_{i-1})$ the history of word $w_i$
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Complexity}
\begin{itemize}
\item Complexity for a vocabulary of 65k words
\begin{itemize}
\item $65\,000^2 = 4~225~000~000$  sequences of 2 words
\item $65\,000^3 \approx 2.74 \times 10^{14}$ sequences of 3 words
\item[] 
\item[\ra] Second language learners often struggle to learn more than 3k words, even after several years of study
\item[\ra] Native English speakers: 15k to 20k word families (lemmas)
\end{itemize}

%https://www.bbc.co.uk/news/world-44569277
%So does someone who can hold a decent conversation in a second language know 15,000 to 20,000 words? Is this a realistic goal for our listener to aim for? Unlikely.
%Prof Webb found that people who have been studying languages in a traditional setting - say French in Britain or English in Japan - often struggle to learn more than 2,000 to 3,000 words, even after years of study. 
\hspace{1cm}
\item[\ra] We can't directly estimate the probability of a sequence by relative frequency!

\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Complexity}
\begin{itemize}
\item Equivalence classes
\begin{itemize}
\item group histories in equivalence classes $\phi$
\item[] 
\item[] 
\begin{center}
\Large $p(W) \approx \ds \prod_{i=1}^{T} p(w_i| \phi(h_i))$
\end{center}
\item[] 
\item Language modelling lies in determining $\phi$ and finding a method to estimate the corresponding probabilities
\item[\ra] see work by Frederick Jelinek

\end{itemize}
\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - n-gram}
\begin{itemize}
\item $n$-gram: sequence of $n$ words
\begin{itemize}
\item Ex.: "The pretty little blue house"
\item[\ra] bi-grams: "The pretty", "pretty little", "little blue", "blue house"
\item[\ra] tri-grams: "The pretty little", "pretty little blue", "little blue house"
\item[\ra] 4-grams: "The pretty little blue", "pretty little blue house"
\end{itemize}
\item[] 
\item For a sequence of size $N$, there are $N$-1 bi-grams, $N$-2 tri-grams, ... $N$-$k$+1 $k$-grams
\item[] 
\item $n$-gram model \ra\ equivalence class mapping history $h_i$ to the $n$-1 previous words
\end{itemize}
\end{frame}
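
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - n-gram extraction (code sketch)}
A minimal Python sketch of the n-gram extraction described on the previous slide (not part of the original toolchain; \texttt{ngrams} is a hypothetical helper and whitespace tokenisation is assumed):
{\scriptsize
\begin{verbatim}
def ngrams(tokens, n):
    """Return all n-grams (as tuples) of a token sequence."""
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

sent = "The pretty little blue house".split()
print(ngrams(sent, 2))   # 4 bi-grams:  N - 1
print(ngrams(sent, 3))   # 3 tri-grams: N - 2
\end{verbatim}
}
\end{frame}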

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Probabilities}

\begin{itemize}
\item How to estimate the n-gram probabilities?
\item Maximum Likelihood Estimation (MLE)
\begin{itemize}
\item Get counts from a \textbf{corpus}
\item \textbf{normalize} them so that they are between 0 and 1
\end{itemize}
\item Unigram probabilities
\item[] \centerline{ \Large $p(w_i) = \ds \frac{C(w_i)}{\ds \sum_{k} C(w_k)} = \ds \frac{C(w_i)}{ \mathrm{corpus~size}}$ }
\item[\ra]  $C(.)$ is the counting function
\item $n$-gram probabilities
\item[] \centerline{ \Large $p(w_i| h_i^n) = \ds \frac{C(h_i^n w_i)}{C(h_i^n)}$ }
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Probabilities / Example}

\begin{block}{Corpus}
\begin{itemize}
\item \bos\ a blue house \eos
\item \bos\ a grey house \eos
\item \bos\ the grey house has the blue table \eos
\end{itemize}
\end{block}

\begin{block}{}
\begin{itemize}
\item Probabilities of some bi-grams:
\begin{itemize}
\item $P(a|\bos) = \ds \frac{2}{3} = 0.67$ ; $P(the|\bos) = \ds \frac{1}{3} = 0.33$ ; $P(\eos|house) = \ds \frac{2}{3} = 0.67$
\item $P(house|grey) = \ds \frac{2}{2} = 1$ ; $P(house|blue) = \ds \frac{1}{2} = 0.5$
\end{itemize}

\item Probabilities of some tri-grams:
\begin{itemize}
\item $P(blue|\bos\ a) = \ds \frac{1}{2} = 0.5$ ; $P(house|a\ blue) = \ds \frac{1}{1} = 1$ 
\end{itemize}

\end{itemize}
\end{block}
\end{frame}
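
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Probabilities / Example (code sketch)}
A minimal Python sketch using the toy corpus of the previous slide; it reproduces some of the MLE bi-gram estimates (the helper names are ours, not from any particular toolkit):
{\scriptsize
\begin{verbatim}
from collections import Counter

corpus = [["<s>", "a", "blue", "house", "</s>"],
          ["<s>", "a", "grey", "house", "</s>"],
          ["<s>", "the", "grey", "house", "has",
           "the", "blue", "table", "</s>"]]

unigrams, bigrams = Counter(), Counter()
for sent in corpus:
    unigrams.update(sent)                 # C(w)
    bigrams.update(zip(sent, sent[1:]))   # C(w_{i-1} w_i)

def p_mle(w, h):                          # p(w | h) = C(h w) / C(h)
    return bigrams[(h, w)] / unigrams[h]

print(p_mle("a", "<s>"))                  # 2/3 ~ 0.67
print(p_mle("house", "grey"))             # 2/2 = 1.0
print(p_mle("</s>", "house"))             # 2/3 ~ 0.67
\end{verbatim}
}
\end{frame}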

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - n-gram}
\begin{itemize}
\item bigram model: $\phi(h_i) = (w_{i-1})$ 
\item[] \centerline{ $p(W) \approx p(w_1) \times \ds \prod_{i=2}^T p(w_i|w_{i-1})$ }
\item trigram model: $\phi(h_i) = (w_{i-1}, w_{i-2})$ 
\item[] \centerline{ $p(W) \approx p(w_1)\times p(w_2|w_1) \times \ds \prod_{i=3}^T p(w_i|w_{i-1}, w_{i-2})$ }
\item $n$-gram: $\phi(h_i) = (w_{i-n+1}, \dots, w_{i-1})$
\item Consequences:
\begin{itemize}
\item $n$-1 words are enough to predict the next word \la\ \textbf{Markov} assumption.
\end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Sequence probability}
\begin{itemize}
\item How to compute the probability of a sequence?
\item[\ra] By combining the $n$-gram probabilities!
\begin{center}
$p(W) = \ds \prod_{i=1}^{T} p(w_i|\phi(h_i))$
\end{center}
\item[] with $h_i = (w_1, w_2, \dots, w_{i-1})$ the history of word $w_i$
\item[] with $\phi(.)$ the function mapping the history to the equivalence classes of size $n$-1
\item[]
\item in practice, $n$ is 4 or 5, rarely 6 \Ra\ larger $n$ requires an exponentially larger quantity of data 
\end{itemize}
%\end{block}

\begin{block}{Example: bi-gram P(\bos\ the grey house \eos)}
\begin{itemize}
\item P(.) = P(the|\bos) * P(grey|the) * P(house|grey) * P(\eos|house) \\
~~~~~ =  0.33 * 0.5 * 1 * 0.67 = 0.111
\item[\ra] a code sketch follows on the next slide
\end{itemize}
\end{block}

\end{frame}
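
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Sequence probability (code sketch)}
A minimal Python sketch of the bi-gram chain product above, reusing the same toy corpus (repeated so the snippet runs standalone); log-probabilities are used to avoid numerical underflow on long sequences:
{\scriptsize
\begin{verbatim}
import math
from collections import Counter

corpus = [["<s>", "a", "blue", "house", "</s>"],
          ["<s>", "a", "grey", "house", "</s>"],
          ["<s>", "the", "grey", "house", "has",
           "the", "blue", "table", "</s>"]]
unigrams, bigrams = Counter(), Counter()
for s in corpus:
    unigrams.update(s)
    bigrams.update(zip(s, s[1:]))

def sentence_logprob(sent):
    """log p(W) ~ sum of log p(w_i | w_{i-1}) (MLE, no smoothing)."""
    return sum(math.log(bigrams[(h, w)] / unigrams[h])
               for h, w in zip(sent, sent[1:]))

lp = sentence_logprob(["<s>", "the", "grey", "house", "</s>"])
print(math.exp(lp))   # 1/3 * 1/2 * 1 * 2/3 ~ 0.111
\end{verbatim}
}
\end{frame}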


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Characteristics}
\begin{itemize}
\item Language structure is implicitly captured by $n$-grams
\begin{itemize}
\item probability of succeeding words, co-occurrences
\item some semantics is captured in the same way
\end{itemize}
\item Probabilities are independent of the position in the sentence
\begin{itemize}
\item add begin (\bos) and end (\eos) of sentence tokens
\end{itemize}
\item Probabilities are estimated on a large quantity of data (corpus), which is assumed to be {\bf well written}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{LM - Zipf's law}

%\vfill
%\begin{columns}
%	\begin{column}[T]{.55\textwidth}
            \begin{block}{Words follow Zipf's law}
		\begin{itemize}
		\item a word's frequency is inversely proportional to its rank in the frequency-ranked word list
		\end{itemize}
	   \end{block}

%	\end{column}%
%\hfill
%	\begin{column}[T]{.45\textwidth}
	\centerline{
		  \includegraphics[width=0.5\textwidth]{Zipf_30wiki_en_labels.png} 
	}
	\begin{itemize}
	\item[] {\scriptsize A plot of the rank versus frequency for the first 10 million words in 30 Wikipedias in a log-log scale.}
	\end{itemize}
%	\end{column}
%\end{columns}
%\vfill
\end{frame}
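
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Zipf's law (code sketch)}
A quick way to eyeball Zipf's law on any corpus: rank words by frequency and check that rank $\times$ frequency stays roughly constant (\texttt{corpus.txt} is a hypothetical whitespace-tokenised text file):
{\scriptsize
\begin{verbatim}
from collections import Counter

def zipf_table(tokens, top=10):
    """Most frequent words with rank, frequency and rank * frequency."""
    ranked = Counter(tokens).most_common(top)
    return [(r, w, c, r * c) for r, (w, c) in enumerate(ranked, start=1)]

tokens = open("corpus.txt").read().split()   # hypothetical corpus file
for rank, word, freq, product in zipf_table(tokens):
    print(rank, word, freq, product)
\end{verbatim}
}
\end{frame}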




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
\frametitle{LM - Unseen sequences}
\vspace{\stretch{1}}

	   \begin{itemize}
            \item Wrong sequences that are not allowed by the language
            \begin{itemize}
            \item Ex.: "house the at blue", "this are wrong" 
            \end{itemize}
            \item Correct sequences that are not seen in the training corpus
            \item[\ra] How to avoid a zero probability?
            \end{itemize}


\begin{block}{Solutions}
\begin{itemize}
\item Increase training corpus size
\item[\ra] training takes longer, and no corpus will ever contain all valid sequences
\item Reserve a (small) probability mass to unseen events
\item[] \centerline{ \Large $\epsilon \geq p(w_i|h_i^n) > 0 ~ \forall i, \forall h$ }
\item[\ra] This is \textbf{smoothing} or \textbf{discounting}
\end{itemize}
\end{block}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Smoothing}
\begin{itemize}
\item Idea: 
\begin{enumerate}
\item take a probability mass $D$ from seen events
\item then redistribute it to unseen events
\end{enumerate}

\item[]
\item Laplace smoothing (also known as \textbf{add-1} smoothing)
\item[]  \centerline{ $\ds  P_{Laplace}(w_i) = \frac{C(w_i)+1}{corpus\ size+|V|}$ }
\item \textbf{add-k} smoothing:  
\item[] \centerline{ $\ds  P_{add-k}(w_i) = \frac{C(w_i)+k}{corpus\ size+k|V|}$  with $0 < k < 1$ }
\item[\ra] a code sketch follows on the next slide
\end{itemize}
\end{frame}
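
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Smoothing (code sketch)}
A minimal Python sketch of add-k smoothing for unigrams, as defined above; the toy counts are ours, and one extra vocabulary slot is reserved for the unknown word:
{\scriptsize
\begin{verbatim}
from collections import Counter

tokens = "<s> a blue house </s> <s> a grey house </s>".split()
counts = Counter(tokens)
N = len(tokens)        # corpus size
V = len(counts) + 1    # vocabulary size, with one extra slot for <unk>

def p_add_k(w, k=1.0):
    """Add-k smoothed unigram probability; k = 1 is Laplace (add-1)."""
    return (counts[w] + k) / (N + k * V)

print(p_add_k("house"))          # seen word, count 2
print(p_add_k("<unk>", k=0.1))   # unseen word still gets p > 0
\end{verbatim}
}
\end{frame}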

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Smoothing}
\begin{itemize}
\item Kneser-Ney smoothing: \liumcyan{absolute discounting} and \orange{continuation}
\item absolute discounting: subtract a fixed quantity $d$ from the counts
\item continuation: words seen in more contexts are more likely to appear in a new context
\begin{itemize}
\item Ex.: in a corpus, "York" may be more frequent than "table"
\item but it is seen almost only in the context "New York", while "table" appears in many more contexts
\item[\ra] so "table" gets the higher probability of \orange{continuation}
\end{itemize}
\item[] \centerline{ $\ds  P_{kn}(w_i|w_{i-1}) = \frac{C(w_{i-1}w_i) \liumcyan{ - d}}{C(w_{i-1})} + \orange{\lambda (w_{i-1}) P_{cont}(w_i)} $}
\item Going further: read the comparative study
\item[\ra]  Stanley F. Chen and Joshua T. Goodman, \emph{An Empirical Study of Smoothing Techniques for Language Modeling}. Computer Speech and Language, 13(4), pp. 359--394, 1999.
\end{itemize}
\end{frame}
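
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Kneser-Ney (code sketch)}
A minimal sketch of an interpolated Kneser-Ney bi-gram model with a single fixed discount $d$ (no \unk\ handling, no recursion beyond the continuation unigram); the corpus is the toy one used earlier and the discount value is an arbitrary choice:
{\scriptsize
\begin{verbatim}
from collections import Counter

corpus = [["<s>", "a", "blue", "house", "</s>"],
          ["<s>", "a", "grey", "house", "</s>"],
          ["<s>", "the", "grey", "house", "has",
           "the", "blue", "table", "</s>"]]
unigrams, bigrams = Counter(), Counter()
for s in corpus:
    unigrams.update(s)
    bigrams.update(zip(s, s[1:]))

d = 0.75                                      # fixed absolute discount
n_types = len(bigrams)                        # distinct bi-gram types
followers = Counter(h for (h, w) in bigrams)  # |{w : C(h w) > 0}| per history
histories = Counter(w for (h, w) in bigrams)  # |{h : C(h w) > 0}| per word

def p_kn(w, h):
    p_cont = histories[w] / n_types           # continuation probability
    lam = d * followers[h] / unigrams[h]      # backoff weight lambda(h)
    return max(bigrams[(h, w)] - d, 0) / unigrams[h] + lam * p_cont

print(p_kn("house", "grey"))   # seen bi-gram
print(p_kn("table", "grey"))   # unseen bi-gram, still > 0
\end{verbatim}
}
\end{frame}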

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Backoff}
\begin{itemize}
\item Idea: exploit lower order history
\item Backoff technique
\item[] \centerline{ \Large  $ \tip(w_i|h_i^n) = \begin{cases} p^-(w_i|h_i^n) & \mbox{if } C(h_i^nw_i) > 0 \\ \alpha(h_i^n) p^-(w_i|h_i^{n-1}) & \mbox{if } C(h_i^nw_i) = 0 \end{cases} $ }
\item with $\alpha(h_i^n)$ the backoff weight
\item[\ra] computed so that the result is a valid probability distribution (probabilities between 0 and 1, summing to 1)
\item[]
\item See \cite{jurafsky2018}
\end{itemize}
\end{frame}
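
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Backoff (code sketch)}
A minimal sketch of the backoff lookup, assuming the discounted probabilities $p^-$ and backoff weights $\alpha$ have already been estimated; the toy values below are made up, stored the way an ARPA-style model would store them:
{\scriptsize
\begin{verbatim}
# Hypothetical discounted probabilities for seen n-grams (keys are tuples)
# and backoff weights for histories; the numbers are invented.
p_disc = {("grey", "house"): 0.8, ("house",): 0.2, ("table",): 0.05}
alpha  = {("grey",): 0.4}

def p_backoff(w, h):
    """Back off to a shorter history until the n-gram has been seen."""
    if h + (w,) in p_disc:              # C(h w) > 0: use the discounted estimate
        return p_disc[h + (w,)]
    if not h:                           # empty history: unigram level
        return p_disc.get((w,), 1e-7)   # tiny floor for unknown words (assumption)
    return alpha.get(h, 1.0) * p_backoff(w, h[1:])  # alpha(h) * p(w | shorter h)

print(p_backoff("house", ("grey",)))  # seen bi-gram
print(p_backoff("table", ("grey",)))  # alpha(grey) * p(table) = 0.4 * 0.05
\end{verbatim}
}
\end{frame}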

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - In practice}
\begin{itemize}
\item How to set the vocabulary?
\item Machine Translation:
	\begin{itemize}
	\item Use all the \sout{words} tokens belonging to {\bf in-domain} corpora
	\item[\ra] target side of train and development corpora
	\item[\ra] specialized monolingual corpora
	\item Most frequent \sout{words} tokens of large generic corpora
	\item[\ra] seen at least $k$ times
	\end{itemize}
\item Speech recognition:
	\begin{itemize}
	\item Only consider words that the speech decoder can produce
	\item[\ra] map all others to \unk
	\end{itemize}
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Training methodology}
\begin{itemize}
\item Merge all training data, then use the standard training procedure
\end{itemize}
\centerline{   \includegraphics[width=0.65\textwidth]{figures/lm_concat} }
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Training methodology}

\vfill
\begin{columns}
	\begin{column}[T]{.55\textwidth}
            \begin{itemize}
            \item (log) linear interpolation
            \item with $J$ models:
            \end{itemize}
            \centerline{   $p(w_i|h_i^n) = \ds \sum_{j=1}^J \lambda_j \cdot p_j(w_i|h_i^n)$ }
            \begin{itemize}
            \item[\ra] $\lambda_j$ are computed using an EM procedure (sketch on the next slide)
            \end{itemize}
	\end{column}%
\hfill
	\begin{column}[T]{.45\textwidth}
	\centerline{
		  \includegraphics[width=0.95\textwidth]{figures/lm_interpolation} 
	}
	\end{column}
\end{columns}
\vfill
\end{frame}
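
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Interpolation weights via EM (code sketch)}
A minimal sketch of EM estimation of the interpolation weights $\lambda_j$ on held-out data; the component models are passed in as plain functions $p_j(w|h)$, which is our simplification, not a particular toolkit's API:
{\scriptsize
\begin{verbatim}
def interpolate(models, lambdas, w, h):
    """p(w|h) = sum_j lambda_j * p_j(w|h), with the lambdas summing to 1."""
    return sum(l * m(w, h) for l, m in zip(lambdas, models))

def em_weights(models, dev, n_iter=20):
    """Re-estimate the mixture weights on a dev set of (word, history) pairs."""
    lambdas = [1.0 / len(models)] * len(models)
    for _ in range(n_iter):
        post = [0.0] * len(models)
        for w, h in dev:
            probs = [l * m(w, h) for l, m in zip(lambdas, models)]
            z = sum(probs)
            for j, p in enumerate(probs):
                post[j] += p / z                  # E-step: posterior of model j
        lambdas = [p / len(dev) for p in post]    # M-step: normalise
    return lambdas

# Toy usage with two hypothetical component models (scores, not real LMs)
m1 = lambda w, h: 0.20 if w == "house" else 0.01
m2 = lambda w, h: 0.05 if w == "house" else 0.02
dev = [("house", ("grey",)), ("table", ("blue",)), ("house", ("blue",))]
print(em_weights([m1, m2], dev))   # weights favour the better-fitting model
\end{verbatim}
}
\end{frame}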



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - perplexity}
\begin{itemize}
\item Perplexity: measures the ability of the model to predict the words of an unseen text
\begin{itemize}
\item unseen = not in training data
\item criterion to be minimized
\item Let's consider a corpus $W = w_1w_2 ... w_N$
\end{itemize}
\end{itemize}
\begin{eqnarray*}
PPL(W) & = & P(w_1w_2 ... w_N)^{ -  \frac{1}{N} }  \\
	& = & \sqrt[\leftroot{-2}\uproot{2}N]{\frac{1}{P(w_1w_2 ... w_N)}}  \\
	& = & \sqrt[\leftroot{-2}\uproot{2}N]{ \prod_{i=1}^N \frac{1}{P(w_i|w_1 ... w_{i-1})}}  \\
\end{eqnarray*}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - perplexity and cross entropy}
\begin{eqnarray*}
PPL(W) & = & 2^{H(W)} \mbox{~~~~~{\small [$H$ is the cross-entropy]}}\\
H(W)	& = & - \ds \frac{1}{N} \sum_{i=1}^{N} \log_2 P(w_i|w_1 ... w_{i-1}) \\
\end{eqnarray*}

\begin{itemize}
\item Perplexity is linked to the number of possible next words that can follow any word
\item[\ra] How accurately can the model predict the next word?
\end{itemize}

\begin{block}{Example with the digits (zero, one, two, ..., nine)}
\begin{itemize}
\item if the corpus is balanced, i.e. $P = \frac{1}{10}$ for all words
\centerline{$ PPL(W)  = P(w_1w_2...w_N)^{ -  \frac{1}{N} } = \left(  \frac{1}{10^N}  \right) ^{ -  \frac{1}{N} }  = \left(  \frac{1}{10}  \right) ^{ - 1} = 10 $}
\item This quantity decreases if some words are more frequent than others
\item[\ra] e.g. if "zero" were 10 times more frequent than the other digits
\end{itemize}
\end{block}

\end{frame}
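
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Perplexity (code sketch)}
A minimal sketch of the perplexity computation via cross-entropy; the per-word log-probabilities are assumed to come from whatever model is being evaluated (the digit example above is reproduced with a uniform $p = 1/10$):
{\scriptsize
\begin{verbatim}
import math

def perplexity(log2_probs):
    """PPL = 2^H with H = -(1/N) * sum_i log2 p(w_i | history_i)."""
    H = -sum(log2_probs) / len(log2_probs)
    return 2 ** H

# Ten equiprobable digits: p = 1/10 for every word in the test text
print(perplexity([math.log2(0.1)] * 50))   # -> 10.0
\end{verbatim}
}
\end{frame}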


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Dealing with OOVs and low frequency words}
\begin{itemize}
\item Replace all words \textbf{not} in the vocabulary by the token \unk
\item Compute the \unk\ probability in the same way as for the other words
\item Apply this rule to words appearing fewer than $n$ times in the training corpus
\item[\ra] Allows us to select and fix the vocabulary size (sketch on the next slide)
\end{itemize}
\begin{itemize}
\item Choice of words mapped to \unk impacts perplexity
\item[\ra] a small vocabulary and a larger \unk probability will reduce the perplexity
\item[\Ra] Always compare perplexities of models having same vocabulary!
\end{itemize}
\end{frame}
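
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{LM - Handling OOVs (code sketch)}
A minimal sketch of the frequency-threshold rule above: the vocabulary is fixed to words seen at least \texttt{min\_count} times and everything else is mapped to the unknown token (the function name is ours):
{\scriptsize
\begin{verbatim}
from collections import Counter

def apply_unk(sentences, min_count=2, unk="<unk>"):
    """Map words seen fewer than min_count times to the <unk> token."""
    counts = Counter(w for s in sentences for w in s)
    vocab = {w for w, c in counts.items() if c >= min_count}
    return [[w if w in vocab else unk for w in s] for s in sentences], vocab

sents = [["a", "blue", "house"], ["a", "grey", "house"], ["the", "grey", "house"]]
mapped, vocab = apply_unk(sents)
print(mapped)   # 'blue' and 'the' (seen only once) become '<unk>'
\end{verbatim}
}
\end{frame}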


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Integrating word classes}
\vspace{\stretch{1}}
\begin{itemize}
\item Many words have similar usage
	\begin{itemize}
	\item numbers, days of the week, months, etc. 
	\end{itemize}
\item Performance can be improved with word clustering
	\begin{itemize}
	\item replace all words in a cluster by a specific token
	\item[\ra] similarly to \unk
	\end{itemize}
\item Clusters can be created manually (expert) or automatically
\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Theory of communication}
\begin{itemize}
\item \textbf{Mathematical theory of communication} \cite{Shannon:1948}
\item Assume a 27-symbol “alphabet,” the 26 letters and a space.
\item[]
\item[]
\item Zero-order approximation (symbols independent and equiprobable).
\item[\ra] {\scriptsize XFOML RXKHRJFFJUJ ZLPWCFWKCYJ FFJEYVKCQSGHYD QPAAMKBZAACIBZLHJQD}

\item First-order approximation (symbols independent but with frequencies of English text).
\item[\ra] {\scriptsize OCRO HLI RGWR NMIELWIS EU LL NBNESEBYA TH EEI ALHENHTTPA OOBTTVA NAH BRL}

\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theory of communication}
\begin{itemize}
\item Second-order approximation (digram structure as in English).
\item[\ra] {\scriptsize ON IE ANTSOUTINYS ARE T INCTORE ST BE S DEAMY ACHIN D ILONASIVE TUCOOWE AT TEASONARE FUSO TIZIN ANDY TOBE SEACE CTISBE.}

\item Third-order approximation (trigram structure as in English).
\item[\ra] {\scriptsize IN NO IST LAT WHEY CRATICT FROURE BIRS GROCID PONDENOME OF DEMONSTURES OF THE REPTAGIN IS REGOACTIONA OF CRE.}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Theory of communication}
\begin{itemize}
\item First-order word approximation. Rather than continue with tetragram, ..., n-gram structure, it is easier and better to jump at this point to word units. Here words are chosen independently but with their appropriate frequencies.
\item[\ra] {\scriptsize REPRESENTING AND SPEEDILY IS AN GOOD APT OR COME CAN DIFFERENT NATURAL HERE HE THE A IN CAME THE TO OF TO EXPERT GRAY COME TO FURNISHES THE LINE MESSAGE HAD BE THESE.}

\item Second-order word approximation. The word transition probabilities are correct but no further structure is included.
\item[\ra] {\scriptsize THE HEAD AND IN FRONTAL ATTACK ON AN ENGLISH WRITER THAT THE CHARACTER OF THIS POINT IS THEREFORE ANOTHER METHOD FOR THE LETTERS THAT THE TIME OF WHO EVER TOLD THE PROBLEM FOR AN UNEXPECTED.}

\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why do n-grams work so well?}
\vspace{\stretch{1}}
\begin{itemize}
\item Probabilities computed on a large corpus
\item[\ra] the more the better 
\item Implicit modelling of syntax and semantics
\item[\ra] Correct word sequences are more probable!
\item[\ra] e.g. number and gender agreement
\item Easy to integrate into search methods like Viterbi
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Issues with n-grams models}
\vspace{\stretch{1}}
\begin{itemize}
\item Cannot model long distance dependencies
\item[\ra] context size is limited (up to 6 in practice)
\item Poor modelling of
\begin{itemize}
\item new vocabulary words
\item domain
\item spontaneous, unprepared text (spoken discourse)
\end{itemize}
\item[\ra] Do not capture the meaning
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}
%\frametitle{LM - SRILM}
%\begin{itemize}
%\item See \cite{Stolcke:2002}
%\item Build a model: ngram-count
%\item[\ra] !! always specify order with {\bf -order N} and use of unknown class with {\bf -unk}
%\item Compute interpolation weights: compute-best-mix
%\item[] use the outputs of the following command:  {\bf ngram -debug 2 -ppl ...}
%\item Compute perplexity, interpolated model: ngram
%\item[] {\bf -ppl <dev corpus>}: compute perplexity on development corpus
%\item[] {\bf -mix-lmK <lmK> -mix-lambdaK <coeffK>} : interpolate several models <lmK> with weights <coeffK> (K ranging from 0 to 9)
%\end{itemize}
%\end{frame}