Commit c6d3dd6a authored by Loïc Barrault

language model sheffield

parent 6b36d160
\documentclass[aspectratio=169,t, xcolor=table]{beamer}
%\documentclass[handout,t]{beamer}
% pdf2ps cm_parole.pdf;
% intro, codage + DTW: psselect -p 1-35,51-61-63 cm_parole.ps > cm_parol_poly.ps
% DTW alone: psselect -p 1-19,35-47 cm_parole.ps > cm_parole_poly.ps
% decode psselect -p1-47,51-74 cm_parole.ps > cm_parole_poly.ps
% psnup -4 -H96mm -W128mm -m15mm -b6mm cm_parole_poly.ps cm_parole_poly.ps4
%
%\usepackage{pgfpages}
%\pgfpagelayout{4 on 1}{a4paper,landscape}
\mode<presentation>
{
%\usetheme{PaloAlto}
% \usetheme{Hannover}
\usetheme{informatics}
\useoutertheme{infolines}
% \setbeamercovered{transparent} % or whatever (possibly just delete it)
}
\setbeamertemplate{navigation symbols}{} % remove navigation symbols
\usefonttheme[onlymath]{serif}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\setlength{\extrarowheight}{3pt}
%\usepackage{xspace}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{multicol}
\input ../macros.tex
\input ../macros_beamer.tex
\input ../mycolors.tex
\usepackage[absolute,showboxes,overlay]{textpos}
%\TPshowboxestrue % comment out once finished
\TPshowboxesfalse % uncomment to make the boxes disappear
\textblockorigin{10mm}{10mm} % origin of the positions
% This is only inserted into the PDF information catalog. Can be left out.
\subject{Statistical Language Modelling}
\title[]{Statistical Language Modelling}
\author[]{Loïc Barrault}
\institute[University of Sheffield]
{
l.barrault@sheffield.ac.uk \\
University of Sheffield\\
}
%\date{09 January 2017}
\date{}
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
%\pgfdeclareimage[height=0.5cm]{limsi-logo}{limsilogo}
%\logo{\pgfuseimage{limsi-logo}}
%\logo{\includegraphics[height=0.5cm]{limsilogo}}
%\logo{\epsfbox{limsilogo.eps}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
%\AtBeginSubsection[]
%{
% \begin{frame}<beamer>
% \frametitle{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
%}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
%\beamerdefaultoverlayspecification{<+->}
\newtheorem{conclusion}[theorem]{Conclusions}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% tutorial TALN
% CM : SMT
%\input{mt_tutl.tex}
%Loic
%\input{mt_intro2.tex}
%\input{mt_ressources.tex}
%\input{mt_eval.tex}
%\input{mt_lm.tex}
%\input{mt_pivot.tex}
%\input{mt_datasel.tex}
\input{mt_lm_sheffield.tex}
%Holger
%\input{cslm.tex}
%\input{mt_align.tex}
%\input{mt_decode.tex}
%\input{mt_tools.tex}
%\input{mt_nlp2.tex}
%\input{mt_asr2.tex}
%\input{mt_concl2.tex}
% old stuff
%\input{mt_plan.tex}
%\input{mt_intro.tex}
%\input{mt_bleu.tex}
%\input{mt_tasks_light.tex}
%\input{mt_smt_light.tex}
%\input{mt_sys_light.tex}
%\input{mt_results.tex} % obsolete, results of 2007
%\input{mt_nlp.tex} % old
%\input{mt_asr.tex} % old
% and kill the abominable icon
\setbeamertemplate{bibliography item}{}
\begin{frame}[allowframebreaks]
\frametitle{References}
% \bibliographystyle{amsalpha}
\bibliographystyle{apalike}
% \bibliographystyle{plain}
\bibliography{refs}
\end{frame}
\end{document}
%!TEX root = m2_language_model_sheffield.tex
%\section{Statistical Language Modelling}
%\subsection{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}
% \frametitle{Plan}
%
%\begin{block}{}
% \begin{itemize}
% \item n-gram language models
% \item Non-parametric language models
% \item Parametric language models
% \end{itemize}
%\end{block}
%\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Natural Language Processing}
\vspace{\stretch{1}}
\begin{block}{}
In neuropsychology, linguistics, and the philosophy of language, a \textbf{natural language} or \textbf{ordinary language} is any language that has evolved naturally in humans through use and repetition without conscious planning or premeditation. Natural languages can take different forms, such as speech or signing. They are distinguished from constructed and formal languages such as those used to program computers or to study logic.
\null\hfill -- Wikipedia [\url{https://en.wikipedia.org/wiki/Natural_language}]
\end{block}
\vspace{\stretch{1}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - What and why?}
\begin{block}{Aims of language models}
\begin{itemize}
\item Predict the future! ... words
\item Assign a probability to a sentence (or sequence of words)
\end{itemize}
\end{block}
\begin{block}{Many applications}
\begin{itemize}
\item Speech recognition
%\begin{itemize}
% \item Lame aise on bleu
% \item Là mais omble eux
% \item La mais on bleue
% \item La maison bleue \la\ This one is more probable!
% \item La maison bleu
%\end{itemize}
\begin{itemize}
\item I eight stake with whine
\item I ate steak with whine
\item I ate steak with wine \la\ This one is more probable!
\end{itemize}
\item Machine Translation: "La maison bleue"
\begin{itemize}
\item "The house, blue" \la\ feels not natural (less probable)
\item "The blue house" \la\ this one seems better! (more probable)
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - LM}
\begin{itemize}
\item Allows us to distinguish well-written sentences from bad ones
\item Should give priority to grammatically and semantically correct sentences
\begin{itemize}
\item in an implicit fashion: no need for a syntactic or semantic analysis
\item Monolingual process \ra\ no adequacy with the source sentence here
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model - LM}
\begin{itemize}
\item Goal: provide a non-zero probability to {\bf all} sequences of words
\begin{itemize}
\item even for non-grammatical sentences
\item learned automatically from texts
\end{itemize}
\end{itemize}
\begin{block}{Issues:}
\begin{itemize}
\item How to assign a probability to a sequence of words?
\item How to deal with unseen words and sequences?
\item How to ensure good probability estimates?
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Language Model}
\begin{itemize}
\item Goal: provide a non-zero probability to all sequences of words $W$ extracted from a {\bf vocabulary} $V$
\item Vocabulary: list of all words known by the model
\begin{itemize}
\item a specific word {\bf <unk>} to manage all the words not in $V$
\item word = sequence of characters without space
\item word $\ne$ linguistic word \ra\ token
\end{itemize}
\item[]
\item[] Let $ W = (w_1, w_2, \dots, w_n)$ with $w_i \in V$ be a word sequence
%\item[]
%\begin{center}
%$p(W) = \ds \prod_{i=1}^{T} p(w_i|h_i)$
%\end{center}
%\item[] with $h_i = (w_1, w_2, \dots, w_{i-1})$ the history of word $w_i$
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Complexity}
\begin{itemize}
\item Complexity for a vocabulary size of 65k
\begin{itemize}
\item $65k^2 = 4~225~000~000$ sequences of 2 words
\item $65k^3 \approx 2.75 \times 10^{14}$ sequences of 3 words
\item[]
\item[\ra] Second language learners often struggle to learn more than 3k words, even after several years
\item[\ra] Native English speakers know 15k to 20k word families (lemmas)
\end{itemize}
%https://www.bbc.co.uk/news/world-44569277
%So does someone who can hold a decent conversation in a second language know 15,000 to 20,000 words? Is this a realistic goal for our listener to aim for? Unlikely.
%Prof Webb found that people who have been studying languages in a traditional setting - say French in Britain or English in Japan - often struggle to learn more than 2,000 to 3,000 words, even after years of study.
\hspace{1cm}
\item[\ra] We can't directly estimate the probability of a sequence by relative frequency!
\end{itemize}
\end{frame}
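% Side note (source-only, not rendered): a quick Python check of the counts above.
%   V = 65_000
%   print(V ** 2)            # 4 225 000 000 possible 2-word sequences
%   print(f"{V ** 3:.2e}")   # ~2.75e+14 possible 3-word sequences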
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Complexity}
\begin{itemize}
\item Equivalence classes
\begin{itemize}
\item group histories in equivalence classes $\phi$
\item[]
\item[]
\begin{center}
\Large $p(W) \approx \ds \prod_{i=1}^{T} p(w_i| \phi(h_i))$
\end{center}
\item[]
\item Language modelling lies in determining $\phi$ and finding a method to estimate the corresponding probabilities
\item[\ra] see work by Frederick Jelinek
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - n-gram}
\begin{itemize}
\item $n$-gram: sequence of $n$ words
\begin{itemize}
\item Ex.: "The pretty little blue house"
\item[\ra] bi-grams: "The pretty", "pretty little", "little blue", "blue house"
\item[\ra] tri-grams: "The pretty little", "pretty little blue", "little blue house"
\item[\ra] 4-grams: "The pretty little blue", "pretty little blue house"
\end{itemize}
\item[]
\item For a sequence of size $N$, there are $N-1$ bi-grams, $N-2$ tri-grams, \dots, $N-k+1$ $k$-grams
\item[]
\item $n$-gram model \ra\ equivalence class mapping the history $h_i$ to the $n-1$ previous words
\end{itemize}
\end{frame}
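% Side note (source-only): a minimal Python sketch of n-gram extraction for the
% example above; the helper name `ngrams` is ours.
%   def ngrams(tokens, n):
%       """Return all n-grams (as tuples) of a token sequence."""
%       return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
%
%   sent = "The pretty little blue house".split()
%   print(ngrams(sent, 2))   # 4 bi-grams, i.e. N-1 of them for N=5
%   print(ngrams(sent, 3))   # 3 tri-grams
%   print(ngrams(sent, 4))   # 2 4-grams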
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Probabilities}
\begin{itemize}
\item How to estimate the n-gram probabilities?
\item Maximum Likelihood Estimation (MLE)
\begin{itemize}
\item Get counts from a \textbf{corpus}
\item \textbf{normalize} them so that they are between 0 and 1
\end{itemize}
\item Unigram probabilities
\item[] \centerline{ \Large $p(w_i) = \ds \frac{C(w_i)}{\ds \sum_{k} C(w_k)} = \ds \frac{C(w_i)}{ \mathrm{corpus~size}}$ }
\item[\ra] $C(.)$ is the counting function
\item $n$-gram probabilities
\item[] \centerline{ \Large $p(w_i| h_i^n) = \ds \frac{C(h_i^n w_i)}{C(h_i^n)}$ }
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Probabilities / Example}
\begin{block}{Corpus}
\begin{itemize}
\item \bos\ a blue house \eos
\item \bos\ a grey house \eos
\item \bos\ the grey house has the blue table \eos
\end{itemize}
\end{block}
\begin{block}{}
\begin{itemize}
\item Probabilities of some bi-grams:
\begin{itemize}
\item $P(a|\bos) = \ds \frac{2}{3} = 0.67$ ; $P(the|\bos) = \ds \frac{1}{3} = 0.33$ ; $P(\eos|house) = \ds \frac{2}{3} = 0.67$
\item $P(house|grey) = \ds \frac{2}{2} = 1$ ; $P(house|blue) = \ds \frac{1}{2} = 0.5$
\end{itemize}
\item Probabilities of some tri-grams:
\begin{itemize}
\item $P(blue|\bos\ a) = \ds \frac{1}{2} = 0.5$ ; $P(house|a\ blue) = \ds \frac{1}{1} = 1$
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
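% Side note (source-only): a minimal Python sketch of MLE estimation on the toy
% corpus above, assuming \bos/\eos denote <s>/</s>; the names (`p_unigram`,
% `p_bigram`) are ours. The later sketches reuse these counts.
%   from collections import Counter
%
%   corpus = [["<s>", "a", "blue", "house", "</s>"],
%             ["<s>", "a", "grey", "house", "</s>"],
%             ["<s>", "the", "grey", "house", "has", "the", "blue", "table", "</s>"]]
%
%   unigrams = Counter(w for sent in corpus for w in sent)
%   bigrams = Counter((u, v) for sent in corpus for u, v in zip(sent, sent[1:]))
%
%   def p_unigram(w):
%       return unigrams[w] / sum(unigrams.values())   # C(w) / corpus size
%
%   def p_bigram(w, prev):
%       return bigrams[(prev, w)] / unigrams[prev]    # C(h w) / C(h)
%
%   print(p_bigram("a", "<s>"))       # 2/3 ~ 0.67
%   print(p_bigram("house", "grey"))  # 2/2 = 1.0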
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - n-gram}
\begin{itemize}
\item bigram model: $\phi(h_i) = (w_{i-1})$
\item[] \centerline{ $p(W) \approx p(w_1) \times \ds \prod_{i=2}^T p(w_i|w_{i-1})$ }
\item trigram model: $\phi(h_i) = (w_{i-1}, w_{i-2})$
\item[] \centerline{ $p(W) \approx p(w_1)\times p(w_2|w_1) \times \ds \prod_{i=3}^T p(w_i|w_{i-1}, w_{i-2})$ }
\item $n$-gram: $\phi(h_i) = (w_{i-n+1}, \dots, w_{i-1})$
\item Consequences:
\begin{itemize}
\item $n-1$ words are enough to predict the next word \la\ \textbf{Markov} assumption.
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Sequence probability}
\begin{itemize}
\item How to compute the probability of a sequence?
\item[\ra] By combining the $n$-gram probabilities!
\begin{center}
$p(W) = \ds \prod_{i=1}^{T} p(w_i|\phi(h_i))$
\end{center}
\item[] with $h_i = (w_1, w_2, \dots, w_{i-1})$ the history of word $w_i$
\item[] with $\phi(.)$ the function mapping the history to its equivalence class, i.e. the $n-1$ previous words
\item[]
\item in practice, $n$ ranges from 4 to 5, rarely 6 \Ra\ larger $n$ requires an exponentially larger quantity of data
\end{itemize}
%\end{block}
\begin{block}{Example: bi-gram P(\bos\ the grey house \eos)}
\begin{itemize}
\item P(.) = P(the|\bos) * P(grey|the) * P(house|grey) * P(\eos|house) \\
~~~~~ = 0.33 * 0.5 * 1 * 0.67 $\approx$ 0.11
\end{itemize}
\end{block}
\end{frame}
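% Side note (source-only): continuing the sketch placed after the example frame,
% the bi-gram chain rule for the sentence in the block above.
%   import math
%
%   def p_sentence_bigram(sent):
%       # p(W) ~ prod_i p(w_i | w_{i-1}) with the MLE bi-gram probabilities
%       return math.prod(p_bigram(w, prev) for prev, w in zip(sent, sent[1:]))
%
%   print(p_sentence_bigram(["<s>", "the", "grey", "house", "</s>"]))
%   # (1/3) * (1/2) * 1 * (2/3) = 1/9 ~ 0.11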
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Characteristics}
\begin{itemize}
\item Language structure implicitly captured by $n$-grams
\begin{itemize}
\item probabilities of word successions, co-occurrences
\item semantics is captured in the same implicit way
\end{itemize}
\item Probabilities are independent of the position in the sentence
\begin{itemize}
\item add begin (\bos) and end (\eos) of sentence tokens
\end{itemize}
\item Probabilities are estimated on a large quantity of data (corpus), which is assumed to be {\bf well written}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Zipf's law}
%\vfill
%\begin{columns}
% \begin{column}[T]{.55\textwidth}
\begin{block}{Words follow Zipf's law}
\begin{itemize}
\item a word's frequency is inversely proportional to its rank in the frequency-ordered word list
\end{itemize}
\end{block}
% \end{column}%
%\hfill
% \begin{column}[T]{.45\textwidth}
\centerline{
\includegraphics[width=0.5\textwidth]{Zipf_30wiki_en_labels.png}
}
\begin{itemize}
\item[] {\scriptsize A plot of the rank versus frequency for the first 10 million words in 30 Wikipedias in a log-log scale.}
\end{itemize}
% \end{column}
%\end{columns}
%\vfill
\end{frame}
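% Side note (source-only): a rough Python sketch to eyeball Zipf's law on any
% large plain-text corpus; the file name corpus.txt is a placeholder.
%   from collections import Counter
%
%   tokens = open("corpus.txt", encoding="utf-8").read().split()
%   freqs = sorted(Counter(tokens).values(), reverse=True)
%   for rank in (1, 10, 100, 1000, 10000):
%       if rank <= len(freqs):
%           # under Zipf's law, rank * frequency stays roughly constant
%           print(rank, freqs[rank - 1], rank * freqs[rank - 1])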
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Unseen sequences}
\vspace{\stretch{1}}
\begin{itemize}
\item Ill-formed sequences that are not allowed by the language
\begin{itemize}
\item Ex.: "house the at blue", "this are wrong"
\end{itemize}
\item Correct sequences that are not seen in the training corpus
\item[\ra] How to avoid a zero probability?
\end{itemize}
\begin{block}{Solutions}
\begin{itemize}
\item Increase training corpus size
\item[\ra] makes training longer + can we ever get a perfect corpus?
\item Reserve a (small) probability mass to unseen events
\item[] \centerline{ \Large $p(w_i|h_i^n) \geq \epsilon > 0 \quad \forall i, \forall h_i^n$ }
\item[\ra] This is \textbf{smoothing} or \textbf{discounting}
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Smoothing}
\begin{itemize}
\item Idea:
\begin{enumerate}
\item take some probability mass $D$ from seen events
\item then redistribute it to unseen events
\end{enumerate}
\item[]
\item Laplace smoothing (also known as \textbf{add 1} smoothing)
\item[] \centerline{ $\ds P_{Laplace}(w_i) = \frac{C(w_i)+1}{\mathrm{corpus~size}+|V|}$ }
\item \textbf{add-k} smoothing:
\item[] \centerline{ $\ds P_{add\text{-}k}(w_i) = \frac{C(w_i)+k}{\mathrm{corpus~size}+k|V|}$ with $0 < k < 1$ }
\end{itemize}
\end{frame}
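% Side note (source-only): a minimal sketch of Laplace and add-k smoothing for
% unigrams, reusing the `unigrams` counts from the earlier sketch; the value of
% k is arbitrary here.
%   def p_laplace(w):
%       N, V = sum(unigrams.values()), len(unigrams)   # corpus size, |V|
%       return (unigrams[w] + 1) / (N + V)
%
%   def p_add_k(w, k=0.5):
%       N, V = sum(unigrams.values()), len(unigrams)
%       return (unigrams[w] + k) / (N + k * V)
%
%   print(p_laplace("purple"))   # an unseen word now gets a small non-zero probability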
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Smoothing}
\begin{itemize}
\item Kneser-Ney smoothing: \liumcyan{absolute discounting} and \orange{continuation}
\item absolute discounting: subtract a certain (fixed) quantity from the counts
\item continuation: words seen in more contexts are more likely to appear in a new context
\begin{itemize}
\item Ex.: in a corpus, "York" is more frequent than "table"
\item but "York" is seen only in the context of "New York", while "table" appears in many more contexts
\item[\ra] so "table" has a higher probability of \orange{continuation}
\end{itemize}
\item[] \centerline{ $\ds P_{kn}(w_i|w_{i-1}) = \frac{C(w_{i-1}w_i) \liumcyan{ - d}}{C(w_{i-1})} + \orange{\lambda (w_{i-1}) P_{cont}(w_i)} $}
\item Going further: read the comparative study
\item[\ra] Stanley F. Chen and Joshua T. Goodman, \emph{An Empirical Study of Smoothing Techniques for Language Modeling}. Computer Speech and Language, 13(4), pp. 359--394, 1999.
\end{itemize}
\end{frame}
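% Side note (source-only): a minimal sketch of interpolated Kneser-Ney for
% bi-grams, reusing the `unigrams`/`bigrams` counts from the earlier sketch;
% the discount d = 0.75 is a common default, not a value from the slides.
%   d = 0.75
%
%   def p_kn(w, prev):
%       types_after_prev = len({b for b in bigrams if b[0] == prev})
%       lam = d * types_after_prev / unigrams[prev]              # redistributed mass
%       p_cont = len({b for b in bigrams if b[1] == w}) / len(bigrams)
%       return max(bigrams[(prev, w)] - d, 0) / unigrams[prev] + lam * p_cont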
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Backoff}
\begin{itemize}
\item Idea: exploit lower order history
\item Backoff technique
\item[] \centerline{ \Large $ \tip(w_i|h_i^n) = \begin{cases} p^-(w_i|h_i^n) & \mbox{if } C(h_i^nw_i) > 0 \\ \alpha(h_i^n) p^-(w_i|h_i^{n-1}) & \mbox{if } C(h_i^nw_i) = 0 \end{cases} $ }
\item with $\alpha(h_i^n)$ the backoff weight
\item[\ra] computed so that the result is a valid probability distribution (probabilities between 0 and 1, summing to 1)
\item[]
\item See \cite{jurafsky2018}
\end{itemize}
\end{frame}
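% Side note (source-only): a minimal sketch of the backoff lookup above, assuming
% the discounted probabilities p- and the backoff weights alpha have already been
% computed (e.g. read from an ARPA-format LM file); all numbers are made up.
%   p_bi = {("grey", "house"): 0.9, ("the", "grey"): 0.45}      # p-(w | w_{i-1})
%   p_uni = {"house": 0.12, "table": 0.04, "<unk>": 0.001}      # p-(w)
%   alpha = {"grey": 0.2, "the": 0.3}                           # backoff weights
%
%   def p_backoff(w, prev):
%       if (prev, w) in p_bi:                 # C(h w) > 0: use the discounted prob.
%           return p_bi[(prev, w)]
%       return alpha.get(prev, 1.0) * p_uni.get(w, p_uni["<unk>"])   # back off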
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - In practice}
\begin{itemize}
\item How to set the vocabulary?
\item Machine Translation:
\begin{itemize}
\item Use all the \sout{words} tokens belonging to {\bf in-domain} corpora
\item[\ra] target side of train and development corpora
\item[\ra] specialized monolingual corpora
\item Most frequent \sout{words} tokens of large generic corpora
\item[\ra] seen at least $k$ times
\end{itemize}
\item Speech recognition:
\begin{itemize}
\item Only consider words that the speech decoder can produce
\item[\ra] map all others to \unk
\end{itemize}
\end{itemize}
\end{frame}
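% Side note (source-only): a minimal sketch of vocabulary selection by frequency,
% with out-of-vocabulary tokens mapped to <unk>; the threshold K is arbitrary and
% `corpus` is any tokenised training corpus (e.g. the one in the earlier sketch).
%   from collections import Counter
%
%   K = 2                                       # keep tokens seen at least K times
%   counts = Counter(w for sent in corpus for w in sent)
%   vocab = {w for w, c in counts.items() if c >= K} | {"<unk>"}
%
%   def map_oov(sent):
%       return [w if w in vocab else "<unk>" for w in sent]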
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Training methodology}
\begin{itemize}
\item Merge training data, standard training procedure
\end{itemize}
\centerline{ \includegraphics[width=0.65\textwidth]{figures/lm_concat} }
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{LM - Training methodology}
\vfill
\begin{columns}
\begin{column}[T]{.55\textwidth}
\begin{itemize}
\item (log) linear interpolation
\item with $J$ models:
\end{itemize}
\centerline{ $p(w_i|h_i^n) = \ds \sum_{j=1}^J \lambda_j \cdot p_j(w_i|h_i^n)$ }
\begin{itemize}
\item[\ra] $\lambda_j$ are computed using an EM procedure
\end{itemize}
\end{column}%
\hfill
\begin{column}[T]{.45\textwidth}
\centerline{
\includegraphics[width=0.95\textwidth]{figures/lm_interpolation}
}
\end{column}
\end{columns}
\vfill
\end{frame}
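% Side note (source-only): a minimal sketch of the EM procedure for the
% interpolation weights, assuming the per-token probabilities of each of the J
% component models on a held-out set are already available; names are ours.
%   def em_weights(model_probs, iters=20):
%       """model_probs[j][i] = p_j(w_i | h_i) on held-out data; returns the lambdas."""
%       J, N = len(model_probs), len(model_probs[0])
%       lam = [1.0 / J] * J                      # start from uniform weights
%       for _ in range(iters):
%           acc = [0.0] * J
%           for i in range(N):
%               mix = sum(lam[j] * model_probs[j][i] for j in range(J))
%               for j in range(J):
%                   acc[j] += lam[j] * model_probs[j][i] / mix   # posterior of model j
%           lam = [a / N for a in acc]
%       return lam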