Commit dc6d8314 authored by Loïc Barrault

Sequence modelling CVC

parent 0e8f6748
m2_DL_sequence_modeling_en.pdf
%!TEX root = m2_DL_sequence_modeling_en.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\vspace{\stretch{1}}
\begin{block}{}
\begin{itemize}
\item[] \huge {Conditional Language Models}
\end{itemize}
\end{block}
\vspace{\stretch{1}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Reminder: RNNLM}
\centering
\includegraphics[width=0.55\textwidth]{figures_en/rnn_unrolled_all}
\begin{itemize}
\item<+-> Probability of a word sequence $\vw = (w_1, w_2, ..., w_\ell)$
\item[]{ \small{ $ p(\vw) = p(w_1) \times p(w_2|w_1) \times p(w_3 | w_1, w_2) \times \dots \times p(w_\ell | w_1, ..., w_{\ell-1}) = \ds \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1})$ } }
% \item[]<2>{ \small {$ p(\vw) = \ds \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1}) $ } }
\item[\ra]<+-> \alert{Probability only depends on the history}
\item<+-> Can we use it to do machine translation or automatic summarization?
\item[\ra]<+-> We have to take the source text into account!
\end{itemize}
\end{frame}
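%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{RNNLM: chain rule in code (sketch)}
{\scriptsize A minimal NumPy sketch of the factorisation above, not code from the course: the toy sizes, random parameters, and the names \texttt{step}/\texttt{log\_prob} are invented for illustration.}
\begin{verbatim}
import numpy as np

V, H = 10, 8                          # toy vocabulary / hidden sizes
rng = np.random.default_rng(0)
E = rng.normal(size=(V, H))           # word embeddings
M = rng.normal(size=(H, 2 * H))       # recurrence matrix
S = rng.normal(size=(V, H))           # output projection

def step(h, w_prev):                  # one RNN step: state + p(.|history)
    h = np.tanh(M @ np.concatenate([h, E[w_prev]]))
    z = S @ h
    p = np.exp(z - z.max()); p /= p.sum()        # softmax
    return h, p

def log_prob(words):                  # chain rule, in log space
    h, w_prev, lp = np.zeros(H), 0, 0.0          # index 0 plays "<s>"
    for w in words:
        h, p = step(h, w_prev)
        lp += np.log(p[w])            # log p(w_t | w_1, ..., w_{t-1})
        w_prev = w
    return lp
\end{verbatim}
\end{frame}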
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Conditional Language Models}
\begin{block}{Conditional Probability given a context $\edinred{\vm{x}}$}
\begin{itemize}
\item[]{ \small{ $ p(\vw | \edinred{\vm{x}}) = \ds \prod_{t=1}^{\ell} p(w_t| \edinred{\vm{x}}, w_1, ..., w_{t-1})$ } }
\item[\ra] Probability of the next word given the history \textbf{and} the source context $\edinred{\vm{x}}$
\item[]
\end{itemize}
\end{block}
\end{frame}
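%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Conditional Language Models: worked example}
\begin{block}{}
\begin{itemize}
\item[] {\small A small worked instance of the factorisation above (the three-word output is invented for illustration):}
\item[] {\small $ p(\text{the cat sleeps} \mid \edinred{\vm{x}}) = p(\text{the} \mid \edinred{\vm{x}}) \times p(\text{cat} \mid \edinred{\vm{x}}, \text{the}) \times p(\text{sleeps} \mid \edinred{\vm{x}}, \text{the}, \text{cat}) $ }
\item[\ra] {\small Each factor is one decoder step, conditioned on $\edinred{\vm{x}}$ and on all previously produced words.}
\end{itemize}
\end{block}
\end{frame}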
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Conditional Language Models}
%\begin{block}{Applications}
\rowcolors[]{1}{gray!20}{gray!10} % \rowcolors is in the xcolor manual
\centering
\begin{tabular}{l|l} % \vrule is to Add rule. can use |
Input $\edinred{\vm{x}}$ & Output $\vm{w}$ \\\midrule % \midrule instead of the old \hline
English sentence & Its Chinese translation \\
Sentence in Chinese & Its French translation \\
An image & Its description \\
Speech signal & Its transcription \\
History of a dialogue & The dialogue system's answer \\
A document + a question & The answer \\
An image + a question & The answer \Ra\ VQA \\
A document & A summary \\
... & ... \\
\end{tabular}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Which data?}
%\begin{block}{}
\begin{itemize}
\item To train a conditional LM, we need \textbf{annotated data}
\rowcolors[]{1}{gray!20}{gray!10} % \rowcolors is in the xcolor manual
{ \centering
\begin{tabular}{l|l}
Translation & Bilingual texts (with sentence-level alignment) \\
Automatic Speech Recognition & Speech signal + manual transcription \\
Image captioning & Images + their descriptions \\
... & ... \\
\end{tabular}}
\item Data quantity and quality may vary
\end{itemize}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{What follows:}
%\begin{block}{}
\begin{itemize}
\item How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ?
\begin{itemize}
\item Problem specific (or at least modality specific).
%\item (Beginning of an) answer in the previous lecture
\end{itemize}
\item[]
\item How to condition the decoder with $\edinred{\vm{x}}$ ?
\begin{itemize}
\item Less problem specific
\item Some architectures in this course
\end{itemize}
\end{itemize}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Neural Model}
\begin{block}{Sequence to sequence}
\begin{itemize}
\item Current approach: encoder/decoder
\item The input sequence is encoded into a low-dimensional vector (a few hundred dimensions)
\item[\ra] \alert{Encoder}
\item The output sequence is generated:
\begin{itemize}
\item with a conditional LM
\item using a fixed size vector from the encoder
\item[\ra] word by word, stop when the end token is generated (``$\langle /s\rangle$'')
\end{itemize}
\item[\ra] \alert{Decoder}
\end{itemize}
\end{block}
\end{frame}
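%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Neural Model: generation loop (sketch)}
{\scriptsize A sketch of the encoder/decoder generation loop just described; \texttt{encode} and \texttt{decoder\_step} stand for any encoder and any conditional-LM step, and the indices chosen for \texttt{<s>}/\texttt{</s>} are assumptions.}
\begin{verbatim}
import numpy as np

BOS, EOS = 0, 1                   # assumed "<s>" and "</s>" indices

def generate(src_words, encode, decoder_step, max_len=50):
    x = encode(src_words)         # fixed size source vector
    h = np.zeros_like(x)          # initial decoder state
    w, out = BOS, []
    for _ in range(max_len):      # word by word...
        h, p = decoder_step(h, w, x)
        w = int(np.argmax(p))     # greedy: most probable word
        if w == EOS:
            break                 # ...stop at the end token
        out.append(w)
    return out
\end{verbatim}
\end{frame}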
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Back to the encoder}
\begin{block}{How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ? }
\begin{itemize}
\item Previous part: RNN, GRU, LSTM
\item What about this architecture? \\
{\centering
\includegraphics[width=0.35\textwidth]{figures_en/bow}
\item[]}
\item<2> \textbf{Bag of words} representation
\end{itemize}
\end{block}
\end{frame}
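%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Back to the encoder: bag of words (sketch)}
{\scriptsize An illustrative NumPy sketch of the bag-of-words encoder in the figure (toy embedding table): the source vector is the mean of the word embeddings, so word order is lost.}
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, H = 10, 8
E = rng.normal(size=(V, H))          # toy embedding table

def bow_encode(words):
    return E[np.array(words)].mean(axis=0)

print(bow_encode([3, 1, 4]).shape)   # (8,) for any input length
# bow_encode([3, 1, 4]) == bow_encode([4, 1, 3]): order-blind!
\end{verbatim}
\end{frame}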
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Back to the encoder}
\begin{block}{\cite{kalchbrenner2013} }
\begin{itemize}
\item[]
{ \centering
\includegraphics[width=0.35\textwidth]{figures_en/conv_sent_encoder}
\item[]
}
\item<+-> \edinred{Convolutional} encoder
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to condition on $\vm{x}$ ?}
\begin{block}{\cite{kalchbrenner2013} }
\centering
\includegraphics[width=0.95\textwidth]<+>{figures_en/rnn_unrolled_4}
\includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_1}
\includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_2}
\includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_3}
\includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_all}
\includegraphics[width=0.55\textwidth]<+>{figures_en/cond_rnn_unrolled_all}
\begin{itemize}
\item[]<.-> $ \vm{h}_t = \phi(\vm{M}[\vm{h}_{t-1}; \vm{w}_{t-1}] \edinred{+ \vm{x}} + \vm{b}) $
\item[]<.-> $ \vm{z}_t = \vm{S}~\vm{h}_{t} + \vm{b'} $
\item[]<.-> $ p(\vm{w}_t | \edinred{\vm{x}}, \vm{w}_{<t}) = \mathrm{softmax}(\vm{z}_t) $
\item[]
\end{itemize}
\end{block}
\end{frame}
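%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{How to condition on $\vm{x}$? (sketch)}
{\scriptsize A direct NumPy transcription of the three equations on the previous slide, with toy sizes and random parameters (a sketch, not the paper's implementation).}
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, H = 10, 8
E = rng.normal(size=(V, H))        # embeddings for the feedback w_{t-1}
M = rng.normal(size=(H, 2 * H))
S = rng.normal(size=(V, H))
b, b2 = np.zeros(H), np.zeros(V)

def cond_step(h_prev, w_prev, x):
    # h_t = phi(M [h_{t-1}; w_{t-1}] + x + b), with phi = tanh
    h = np.tanh(M @ np.concatenate([h_prev, E[w_prev]]) + x + b)
    z = S @ h + b2                 # z_t = S h_t + b'
    p = np.exp(z - z.max()); p /= p.sum()  # p(w_t|x, w_<t) = softmax(z_t)
    return h, p
\end{verbatim}
\end{frame}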
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Machine Translation: \cite{sutskever2014}}
\begin{columns}
\column{0.5\textwidth}
\begin{block}{Architecture}
\begin{itemize}
\item Encoder: $LSTM(\edinred{\vm{x}_i}, \vm{c}_{i-1}, \vm{h}_{i-1})$
\item[\ra] Provides the vector $\edinred{\vm{x}_{\ell}}$, where $\ell$ is the source sequence length: a fixed size representation.
\item[]
\item Decoder: $LSTM(\vm{w}_{t-1}, \edinred{\vm{x}_{\ell}}, \vm{h}_{t-1})$
\begin{itemize}
\item $w_0 = \langle s \rangle$ \\
\item[]
\item[]
\item[]
\item[]
\item[]
\end{itemize}
\end{itemize}
\end{block}
\column{0.5\textwidth} \\
\centering \includegraphics[height=4cm]{figures_en/lstm}
\end{columns}
\end{frame}
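%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Machine Translation: LSTM step (sketch)}
{\scriptsize A standard LSTM cell step in NumPy, to make $LSTM(\vm{x}_i, \vm{c}_{i-1}, \vm{h}_{i-1})$ concrete; toy shapes, not those of \cite{sutskever2014}. The last $\vm{h}$ over the source plays the role of $\edinred{\vm{x}_{\ell}}$.}
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
D, H = 8, 8
W = rng.normal(size=(4 * H, D + H)) * 0.1  # 4 gates, stacked
b = np.zeros(4 * H)
sigm = lambda a: 1.0 / (1.0 + np.exp(-a))

def lstm_step(x, c_prev, h_prev):
    g = W @ np.concatenate([x, h_prev]) + b
    i, f, o, u = np.split(g, 4)    # input, forget, output, update
    c = sigm(f) * c_prev + sigm(i) * np.tanh(u)
    h = sigm(o) * np.tanh(c)
    return c, h

c = h = np.zeros(H)
for x in rng.normal(size=(5, D)):  # encode a toy source sequence
    c, h = lstm_step(x, c, h)      # final h ~ the sentence vector
\end{verbatim}
\end{frame}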
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%architecture
\begin{frame}
\frametitle{Machine Translation: \cite{sutskever2014}}
\begin{columns}
\column{0.5\textwidth}
\begin{block}{}
\begin{itemize}
\item<2->{ {\color{edinred} [1.]} ~Word encoded into \emph{1-hot} vector }
\item<3->{ {\color{cyan} [2.]} Projection into an \textbf{\textit{embedding}} }
\item<4->{ {\color{gray} [3.]} Update hidden state of \emph{\alert{encoder}} RNN }
\item[\ra]<5->{ {\color{liumgreen} [4.]} Sentence representation }
\item<6->{ {\color{edingreen} [5.]} Update hidden state of \emph{\alert{decoder}} RNN }
\begin{itemize}
\item<.-> previous state, source context $\vm{c}$, feedback
\end{itemize}
\item<7->{ {\color{purple} [6.]} Calculate probability distribution for \alert{all} words }
\item<8->{ {\color{orange} [7.]} Next word (most probable) }
\end{itemize}
\end{block}
\column{0.5\textwidth} \\
\centering{
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_1}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_2}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_3}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_4}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_5}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_6}
\includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_7}
}%centering
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Results}
\begin{block}{}
\centerline{
\includegraphics[width=0.8\textwidth]{figures_en/nmt_sentence_length}
}
\begin{itemize}
\item[\ra] Translation score decreases with sentence length!
\item How to explain that?
\only<1>{ \begin{enumerate}
\item Not enough data in training corpus?
\item Difficult to generate a long coherent sentence?
\item Fixed size vector insufficient to encode a long sentence?
\end{enumerate}
}
\only<2>{ \begin{enumerate}
\item \sout{Not enough data in training corpus?}
\item \sout{Difficult to generate a long coherent sentence?}
\item \edinred{Fixed size vector insufficient to encode a long sentence!!!}
\end{enumerate}
}
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{RNNs for sentence representation}
\begin{block}{Conclusion}
\begin{itemize}
\item Pros:
\begin{itemize}
\item RNNs can naturally manage variable length sequences
\item GRUs and LSTMs can \textbf{in principle} propagate gradients through a long chain of non-linearities
\item Very simple architecture in the end!
\end{itemize}
\item[]
\item Cons:
\begin{itemize}
\item Hidden state must memorize a large quantity of information!
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{RNNs for sentence representation}
\begin{block}{}
\begin{itemize}
\item A large quantity of data is compressed into a fixed size vector
\item Gradients have a long way to go
\begin{itemize}
\item Even LSTMs forget!
\end{itemize}
\item[]
\item Prof. Ray Mooney (U. Texas):
\item[] ``You can't cram the meaning of a whole \$\^{}!\# sentence into a single \$\#!\% vector!''
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{From vector to matrix representation}
\begin{block}{}
\begin{itemize}
\item Represent input sequence with a matrix
\item Generate output sequence using the matrix
\item[]
\item[\ra] Solves the capacity problem
\item[\ra] also referred to as model \textbf{expressivity}
\item[]
\item[\ra] Solves the gradient flow problem
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Representing sentences with a matrix}
\begin{block}{}
\begin{itemize}
\item Fixed size vector: same size regardless of the input sequence length
\item[]
\item Matrix: the number of rows is fixed
\item[\ra] the number of characteristics (\textbf{features})
\item Number of columns = number of words in the source sequence (in general)
\item[]
\item[\ra] How to build this matrix?
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Concatenation}
\begin{block}{}
\begin{itemize}
\item Concatenation of word embeddings
\item The simplest possible model
% \item[\ra] no paper published so far, any volunteers?
\item[]
\item Using convolutional networks \cite{gehring2017}
\item[\ra] not covered here
\item[]
\item Using bidirectional RNNs \cite{bahdanau2014}
\item[\ra] most used method
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Bidirectional Encoder}
\begin{block}{}
\centering{
\only<1>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_1} \\
{\color{gray} [1.]} ~\emph{1-hot} vector + projection + update \alert{forward} hidden unit }
\only<2>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_2} \\
{\color{brown} [1bis.]} ~update \alert{backward} hidden unit }
\only<3>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_all} \\
{\color{brown} [2.]} \alert{Annotation} = concatenation of \alert{forward} and \alert{backward} vectors \\
{\small Every $\vm{h}_i$ encodes the full sentence with a focus on the \ith\ word} }
}
\end{block}
\vspace{.2cm}
\end{frame}
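%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Bidirectional Encoder (sketch)}
{\scriptsize A NumPy sketch of the annotation matrix just built: one plain RNN per direction, states concatenated per word (toy sizes; \cite{bahdanau2014} use GRUs).}
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
D, H = 8, 8
Wf = rng.normal(size=(H, D + H)) * 0.1     # forward RNN
Wb = rng.normal(size=(H, D + H)) * 0.1     # backward RNN

def run(W, xs):
    h, out = np.zeros(H), []
    for x in xs:
        h = np.tanh(W @ np.concatenate([x, h]))
        out.append(h)
    return out

def annotations(xs):
    fwd = run(Wf, xs)
    bwd = run(Wb, xs[::-1])[::-1]          # reversed input, realigned
    return np.stack([np.concatenate([f, b])
                     for f, b in zip(fwd, bwd)], axis=1)

A = annotations(list(rng.normal(size=(5, D))))
print(A.shape)    # (16, 5): fixed rows, one column per source word
\end{verbatim}
\end{frame}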
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention Mechanism}
\begin{block}{}
\begin{itemize}
\item How do we feed this matrix to the decoder?
\item Reminder: the decoder is made of one (or several) recurrent units
\item[]
\item Solution: go back to a fixed size vector \textbf{dynamically}
\begin{itemize}
\item at each decoding step (\emph{timestep})
\end{itemize}
\item[]
\item[\ra] \textbf{Attention mechanism}
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention mechanism}
\begin{block}{Before: sentence represented by a vector }
\centering{
\includegraphics[height=0.7\textheight]{figures_en/enc_dec_all}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention mechanism}
\begin{block}{After: sentence represented by a matrix}
\centering{
\includegraphics[height=0.7\textheight]{figures_en/dec_attention_0}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention mechanism}
%% \begin{itemize}
%% \item Observation: a fixed size vector is insufficient to represent a long sentence
%% \item[\ra] Idea: let's use \alert{several} representations!
%% %\item[\ra] Each representation focuses on a part of the input
%% \end{itemize}
%
\begin{columns}
\column{0.5\textwidth}
\begin{block}{}
\begin{itemize}
\item<+-> {\color{brown} [2.]} ~The decoder gets the \alert{annotations} from the encoder.
\item<+-> {\color{cyan} [3.]} ~ \alert{Attention weights} calculated with a feedforward NN. \\
{\small ~ \ra ~ a weighted \emph{mean} vector is calculated: $\tilde{\vm{h}}_j = \ds \sum_i \alpha_{ij} \vm{h}_{i}$ }
\item<+->{\color{brown} [4.]} Update the hidden state of the decoder GRU
\item<+-> {\color{purple} [5.]} Calculate probability distribution for \alert{all} words
\item<+-> {\color{orange} [6.]} Generate next word (most probable)
\end{itemize}
\end{block}
\column{0.5\textwidth}
\end{columns}
\begin{textblock*}{50mm}[0,0](62mm,9mm)
\only<1>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_1} }
\only<2>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_2} }
\only<3>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_3} }
\only<4>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_4} }
\only<5>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_all} }
\end{textblock*}
\end{frame}
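%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Attention mechanism (sketch)}
{\scriptsize A NumPy sketch of one attention step as described above: a small feedforward net scores each annotation against the decoder state, softmax gives the weights $\alpha$, and the context is their weighted mean (toy shapes and names).}
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
A2, H = 16, 8                     # annotation size, decoder state size
Wa = rng.normal(size=(H, A2 + H)) * 0.1
va = rng.normal(size=H) * 0.1

def attend(h_dec, annots):        # annots: (A2, n) matrix
    scores = np.array([va @ np.tanh(Wa @ np.concatenate([a, h_dec]))
                       for a in annots.T])
    alpha = np.exp(scores - scores.max()); alpha /= alpha.sum()
    return annots @ alpha, alpha  # dynamic fixed size vector + weights

ctx, alpha = attend(np.zeros(H), rng.normal(size=(A2, 5)))
print(alpha.sum(), ctx.shape)     # weights sum to 1; ctx is (16,)
# alpha is also the soft source/target alignment shown later
\end{verbatim}
\end{frame}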
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention mechanism}
%\begin{block}{}
%\centering{
\begin{textblock*}{150mm}[0,0](30mm,10mm)
\only<1>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step1} }
\only<2>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step2} }
\only<3>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step3} }
\only<4>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step4} }
\only<5>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step5} }
\only<6>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step6} }
\only<7>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step8} }
\only<8>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_steplast} }
\end{textblock*}
%}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A word on gradients}
\begin{block}{Without attention mechanism: }
\centering{
\only<1>{ \includegraphics[height=0.6\textheight]{figures_en/enc_dec_all} }
\only<2>{ \includegraphics[height=0.6\textheight]{figures_en/dec_backprop} }
}
\begin{itemize}
\item<2-> All gradients must flow through the last encoder hidden state.
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A word on gradients}
\begin{block}{With attention mechanism: }
\centering{
\only<1>{ \includegraphics[height=0.6\textheight]{figures_en/dec_attention_backprop} }
}
\begin{itemize}
\item The attention mechanism facilitates gradient propagation towards the encoder
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention and translation}
\begin{block}{Some considerations/remarks:}
\begin{itemize}
\item Does a human translator memorise the whole source sentence and then proceed to translate?
\begin{itemize}
\item Compressing the sentence into a fixed size vector amounts to exactly that
\end{itemize}
\item A human translator goes back and forth between source sentence and current (partial) translation
\begin{itemize}
\item Attention mechanism allows that
\end{itemize}
\item[]
\item Should humans be a model for machines? That's another story...
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Attention and translation}
\begin{columns}
\column{0.5\textwidth}
\begin{block}{Attention Mechanism \Ra\ alignment }
\begin{itemize}
\item For each produced word, a set of attention weights is created (one weight per source word)
\item \textbf{Alignment} and translation models jointly trained!
\item[\ra] \cite{bahdanau2014}
\item[]
\end{itemize}
\end{block}
\column{0.5\textwidth}
\centering{
\includegraphics[height=0.6\textheight]{figures_en/attention_alignment}
}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Summary}
\begin{block}{ }
\begin{itemize}
\item Attention
\begin{itemize}
\item allows connecting units that are far apart
\end{itemize}
\item Standard attention models focus on the ``content''
\begin{itemize}
\item some bias can be added
\item[\ra] for example, to have a better source word coverage
\end{itemize}
\item[]
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Algorithm}
\centering{
\includegraphics[height=0.8\textheight]{figures_en/dec_algo}
}
\begin{itemize}
\item[] \small{[From C. Dyer @ MTMarathon 2017]}
\end{itemize}
\end{frame}