Commit dc6d8314 by Loïc Barrault

### Sequence modelling CVC

parent 0e8f6748
 m2_DL_sequence_modeling_en.pdf
%!TEX root = m2_DL_sequence_modeling_en.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{} \vspace{\stretch{1}} \begin{block}{} \begin{itemize} \item[] \huge {Conditional Language Models} \end{itemize} \end{block} \vspace{\stretch{1}} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Reminder: RNNLM} \centering \includegraphics[width=0.55\textwidth]{figures_en/rnn_unrolled_all} \begin{itemize} \item<+-> Probability of a word sequence $\vw = (w_1, w_2, ..., w_\ell)$ \item[]{ \small{ $p(\vw) = p(w_1) \times p(w_2|w_1) \times p(w_3 | w_1, w_2) \times \dots \times p(w_\ell | w_1, ..., w_{\ell-1}) = \ds \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1})$ } }
% \item[]<2>{ \small {$p(\vw) = \ds \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1})$ } }
\item[\ra]<+-> \alert{Probability only depends on the history} \item<+-> Can we use it to do machine translation or automatic summarization? \item[\ra]<+-> We have to take source text into account! \end{itemize} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Conditional Language Models} \begin{block}{Conditional Probability given a context $\edinred{\vm{x}}$} \begin{itemize} \item[] \item[]{ \small{ $p(\vw | \edinred{\vm{x}}) = \ds \prod_{t=1}^{\ell} p(w_t| \edinred{\vm{x}}, w_1, ..., w_{t-1})$ } } \item[\ra] Probability of the next word given history \textbf{and} source context $\edinred{\vm{x}}$ ? \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Conditional Language Models}
%\begin{block}{Applications}
\rowcolors[]{1}{gray!20}{gray!10} % \rowcolors is in the xcolor manual
\centering \begin{tabular}{l|l} % \vrule is to Add rule.
% can use |
Input $\edinred{\vm{x}}$ & Output $\vm{w}$ \\\midrule % \midrule instead of the old \hline
English sentence & Its Chinese translation \\ Sentence in Chinese & Its French translation \\ An image & Its description \\ Speech signal & Its transcription \\ History of a dialogue & The answer of the dialogue system \\ A document + a question & The answer \\ An image + a question & The answer \Ra\ VQA \\ A document & A summary \\ ... & ... \\ \end{tabular}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Which data?}
%\begin{block}{}
\begin{itemize} \item to train conditional LM, we need \textbf{annotated data} \rowcolors[]{1}{gray!20}{gray!10} % \rowcolors is in the xcolor manual
{ \centering \begin{tabular}{l|l} Translation & Bilingual Texts (with sentence level alignment) \\ Automatic Speech Recognition & Speech signal + manual transcription \\ Image captioning & Images + their descriptions \\ ... & ... \\ \end{tabular}} \item Data quantity and quality may vary \end{itemize}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{What follows:}
%\begin{block}{}
\begin{itemize} \item How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ? \begin{itemize} \item Problem specific (or at least modality specific).
%\item (Début de) réponse au cours précédent
\end{itemize} \item[] \item How to condition the decoder with $\edinred{\vm{x}}$ ?
\begin{itemize} \item Less problem specific \item Some architectures in this course \end{itemize} \end{itemize}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Neural Model} \begin{block}{Sequence to sequence} \begin{itemize} \item Current approach: encoder/decoder \item The input sequence is encoded into a low-dimensional vector (a few hundred dimensions) \item[\ra] \alert{Encoder} \item Output sequence is generated: \begin{itemize} \item with a conditional LM \item using a fixed size vector from the encoder \item[\ra] word by word, stop when end token is generated (``$\langle /s\rangle$'') \end{itemize} \item[\ra] \alert{Decoder} \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Back to the encoder} \begin{block}{How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ? } \begin{itemize} \item Previous part: RNN, GRU, LSTM \item What about this architecture?
\\ {\centering \includegraphics[width=0.35\textwidth]{figures_en/bow} \item[]} \item<2> \textbf{Bag of words} representation \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Back to the encoder} \begin{block}{\cite{kalchbrenner2013} } \begin{itemize} \item[] { \centering \includegraphics[width=0.35\textwidth]{figures_en/conv_sent_encoder} \item[] } \item<+-> \edinred{Convolutional} encoder \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{How to condition on $\vm{x}$ ?} \begin{block}{\cite{kalchbrenner2013} } \centering \includegraphics[width=0.95\textwidth]<+>{figures_en/rnn_unrolled_4} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_1} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_2} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_3} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_all} \includegraphics[width=0.55\textwidth]<+>{figures_en/cond_rnn_unrolled_all} \begin{itemize} \item[]<.-> $\vm{h}_t = \phi(\vm{M}[\vm{h}_{t-1}; \vm{w}_{t-1}] \edinred{+ \vm{x}} + \vm{b})$ \item[]<.-> $\vm{z}_t = \vm{S}~\vm{h}_{t} + \vm{b'}$ \item[]<.-> $p(\vm{w}_t | \edinred{\vm{x}}, \vm{w}_{<t}) = \mathrm{softmax}(\vm{z}_t)$ \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% NOTE(review): the source appears truncated at this point in this revision; the
% closing of the previous frame and the header of the following encoder/decoder
% frame were reconstructed — verify against the original deck.
\begin{frame} \frametitle{Encoder-Decoder} \begin{columns} \column{0.5\textwidth} \begin{block}{} \begin{itemize} \item<2->{ {\color{edinred} [1.]} ~Word encoded into \emph{1-hot} vector } \item<3->{ {\color{cyan} [2.]} Projection into an \textbf{\textit{embedding}} } \item<4->{ {\color{gray} [3.]} Update hidden state of \emph{\alert{encoder}} RNN } \item[\ra]<5->{ {\color{liumgreen} [4.]} Sentence representation } \item<6->{ {\color{edingreen} [5.]} Update hidden state of \emph{\alert{decoder}} RNN } \begin{itemize} \item<.-> previous state, source context $\vm{c}$, feedback \end{itemize} \item<7->{ {\color{purple} [6.]} Calculate probability distribution for \alert{all} words } \item<8->{ {\color{orange} [7.]} Next word (most probable) } \end{itemize} \end{block}
\column{0.5\textwidth} \\ \centering{ \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_1} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_2} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_3} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_4} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_5} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_6} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_7} }%centering
\end{columns} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Results} \begin{block}{} \centerline{ \includegraphics[width=0.8\textwidth]{figures_en/nmt_sentence_length} } \begin{itemize} \item[\ra] Translation score decreases with sentence size! \item How to explain that? \only<1>{ \begin{enumerate} \item Not enough data in training corpus? \item Difficult to generate a long coherent sentence? \item Fixed size vector insufficient to encode a long sentence? \end{enumerate} } \only<2>{ \begin{enumerate} \item \sout{Not enough data in training corpus?} \item \sout{Difficult to generate a long coherent sentence?} \item \edinred{Fixed size vector insufficient to encode a long sentence !!!} \end{enumerate} } \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{RNNs for sentence representation} \begin{block}{Conclusion} \begin{itemize} \item Pros: \begin{itemize} \item RNNs can naturally manage variable length sequences \item GRUs and LSTMs can \textbf{in principle} propagate gradients through a long chain of non-linearities \item Very simple architecture in the end! \end{itemize} \item[] \item Cons: \begin{itemize} \item Hidden state must memorize a large quantity of information!
\end{itemize} \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{RNNs for sentence representation} \begin{block}{} \begin{itemize} \item A large quantity of data is compressed into a fixed size vector \item Gradients have a long way to go \begin{itemize} \item Even LSTMs forget! \end{itemize} \item[] \item Prof. Ray Mooney (U. Texas): \item[] ``You can't cram the meaning of a whole \$\^{}!\# sentence into a single \$\#!\% vector!'' \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{From vector to matrix representation} \begin{block}{} \begin{itemize} \item Represent input sequence with a matrix \item Generate output sequence using the matrix \item[] \item[\ra] Solve the problem of capacity \item[\ra] we also refer to model \textbf{expressivity} \item[] \item[\ra] Solve the problem of gradient flow \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Representing sentences with a matrix} \begin{block}{} \begin{itemize} \item Fixed size vector: regardless of the input sequence size \item[] \item Matrix: number of rows is fixed \item[\ra] number of characteristics (\textbf{features}) \item Number of columns = number of words in the source sequence (in general) \item[] \item[\ra] How to build this matrix? \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Concatenation} \begin{block}{} \begin{itemize} \item Concatenation of word embeddings \item simplest possible model
% \item[\ra] no paper published so far, any volunteers?
\item[] \item Using convolutional networks \cite{gehring2017} \item[\ra] won't address this here \item[] \item Using bidirectional RNNs \cite{bahdanau2014} \item[\ra] most used method \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Bidirectional Encoder} \begin{block}{} \centering{ \only<1>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_1} \\ {\color{gray} [1.]} ~\emph{1-hot} vector + projection + update \alert{forward} hidden unit } \only<2>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_2} \\ {\color{brown} [1bis.]} ~update \alert{backward} hidden unit } \only<3>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_all} \\ {\color{brown} [2.]} \alert{Annotation} = concatenation of \alert{forward} and \alert{backward} vectors \\ {\small Every $\vm{h}_i$ encodes the full sentence with a focus on the \ith\ word} } } \end{block} \vspace{.2cm} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention Mechanism} \begin{block}{} \begin{itemize} \item How to process this matrix in the decoder?
\item Reminder: decoder is made of one (or several) recurrent units \item[] \item Solution: go back to a fixed size vector \textbf{dynamically} \begin{itemize} \item at each step (\emph{timestep}) \end{itemize} \item[] \item[\ra] \textbf{Attention mechanism} \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention mechanism} \begin{block}{Before: sentence represented by a vector } \centering{ \includegraphics[height=0.7\textheight]{figures_en/enc_dec_all} } \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention mechanism} \begin{block}{After: sentence represented by a matrix} \centering{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_0} } \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention mechanism}
%% \begin{itemize}
%% \item Constat : un vecteur de taille fixe est insuffisant pour représenter une phrase longue
%% \item[\ra] Idée : utilisons \alert{plusieurs} représentations !
%% %\item[\ra] Chaque représentations se concentre sur une partie de l'entrée
%% \end{itemize}
%
\begin{columns} \column{0.5\textwidth} \begin{block}{} \begin{itemize} \item<+-> {\color{brown} [2.]} ~Decoder gets the \alert{annotations} from the encoder. \item<+-> {\color{cyan} [3.]} ~ \alert{Attention weights} calculated with feedforward NN.
\\ {\small ~ \ra ~ a \emph{mean} vector is calculated $\tilde{\vm{h}}_j = \ds \sum_i \alpha_{ij} \vm{h}_{i}$ } \item<+->{\color{brown} [4.]} Update weights of decoder GRU \item<+-> {\color{purple} [5.]} Calculate probability distribution for \alert{all} words \item<+-> {\color{orange} [6.]} Generate next word (most probable) \end{itemize} \end{block} \column{0.5\textwidth} \end{columns} \begin{textblock*}{50mm}[0,0](62mm,9mm) \only<1>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_1} } \only<2>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_2} } \only<3>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_3} } \only<4>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_4} } \only<5>{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_all} } \end{textblock*} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention mechanism}
%\begin{block}{}
%\centering{
\begin{textblock*}{150mm}[0,0](30mm,10mm) \only<1>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step1} } \only<2>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step2} } \only<3>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step3} } \only<4>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step4} } \only<5>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step5} } \only<6>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step6} } \only<7>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_step8} } \only<8>{ \includegraphics[height=0.4\textwidth]{figures_en/dec_attention_steplast} } \end{textblock*}
%}
%\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{A word on gradients} \begin{block}{Without attention mechanism: } \centering{ \only<1>{
\includegraphics[height=0.6\textheight]{figures_en/enc_dec_all} } \only<2>{ \includegraphics[height=0.6\textheight]{figures_en/dec_backprop} } } \begin{itemize} \item<2-> Gradients go through the last encoder hidden state. \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{A word on gradients} \begin{block}{With attention mechanism: } \centering{ \only<1>{ \includegraphics[height=0.6\textheight]{figures_en/dec_attention_backprop} } } \begin{itemize} \item The attention mechanism facilitates gradient propagation towards the encoder \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention and translation} \begin{block}{Some considerations/remarks:} \begin{itemize} \item Does a human translator memorise the whole source sentence and then proceed to translate? \begin{itemize} \item Compressing the sentence into a fixed size vector corresponds to that \end{itemize} \item A human translator goes back and forth between source sentence and current (partial) translation \begin{itemize} \item Attention mechanism allows that \end{itemize} \item[] \item Should humans be a model for machines? that's another story\dots \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Attention and translation} \begin{columns} \column{0.5\textwidth} \begin{block}{Attention Mechanism \Ra\ alignment } \begin{itemize} \item For each produced word, a set of attention weights is created (set length is size of source sequence) \item \textbf{Alignment} and translation models jointly trained!
\item[\ra] \cite{bahdanau2014} \item[] \end{itemize} \end{block} \column{0.5\textwidth} \centering{ \includegraphics[height=0.6\textheight]{figures_en/attention_alignment} } \end{columns} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Summary} \begin{block}{ } \begin{itemize} \item Attention \begin{itemize} \item allows connecting units that are far from each other \end{itemize} \item Standard attention models focus on the ``content'' \begin{itemize} \item some bias can be added \item[\ra] for example, to have a better source word coverage \end{itemize} \item[] \end{itemize} \end{block} \end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame} \frametitle{Algorithm} \centering{ \includegraphics[height=0.8\textheight]{figures_en/dec_algo} } \begin{itemize} \item[] \small{[From C. Dyer @ MTMarathon 2017]} \end{itemize} \end{frame}

98.1 KB

111 KB

417 KB