Commit 6ea31fce by Loïc Barrault

### more printing friendly

parent cd08954c
 ... ... @@ -16,8 +16,9 @@ \begin{frame} \frametitle{Reminder: RNNLM} \centering \centerline{ \includegraphics[width=0.55\textwidth]{figures_en/rnn_unrolled_all} } \begin{itemize} \item<+-> Probability of a word sequence $\vw = (w_1, w_2, ..., w_\ell)$ \item[]{ \small{ $p(\vw) = p(w_1) \times p(w_2|w_1) \times p(w_3 | w_1, w_2) \times \dots \times p(w_l | w_1, ..., w_{\ell-1}) = \ds \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1})$ } } ... ... @@ -125,52 +126,55 @@ A document & A summary \\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Back to the encoder} \begin{block}{How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ? } \textbf{How to represent the source sequence with a fixed size vector $\edinred{\vm{x}}$ ? } \begin{itemize} \item Previous part: RNN, GRU, LSTM \item What about this architecture? \\ {\centering \centerline{ \includegraphics[width=0.35\textwidth]{figures_en/bow} \item[]} } \item<2> \textbf{Bag of words} representation \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Back to the encoder} \begin{block}{\cite{kalchbrenner2013} } \textbf{\cite{kalchbrenner2013}} \vfill \begin{itemize} \item[] { \centering \includegraphics[width=0.35\textwidth]{figures_en/conv_sent_encoder} \item[] \centerline{ \includegraphics[width=0.35\textwidth]{figures_en/conv_sent_encoder} } \vfill \item<+-> \edinred{Convolutional} encoder \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to condition on $\vm{x}$ ?} \begin{block}{\cite{kalchbrenner2013} } \centering \includegraphics[width=0.95\textwidth]<+>{figures_en/rnn_unrolled_4} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_1} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_2} 
\includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_3} \includegraphics[width=0.95\textwidth]<+>{figures_en/cond_rnn_unrolled_all} \includegraphics[width=0.55\textwidth]<+>{figures_en/cond_rnn_unrolled_all} \textbf{\cite{kalchbrenner2013}} \centerline{ \includegraphics[width=0.95\textwidth]{figures_en/rnn_unrolled_4}<+> \includegraphics[width=0.95\textwidth]{figures_en/cond_rnn_unrolled_1}<+> \includegraphics[width=0.95\textwidth]{figures_en/cond_rnn_unrolled_2}<+> \includegraphics[width=0.95\textwidth]{figures_en/cond_rnn_unrolled_3}<+> \includegraphics[width=0.95\textwidth]{figures_en/cond_rnn_unrolled_all}<+> \includegraphics[width=0.55\textwidth]{figures_en/cond_rnn_unrolled_all}<+> } \begin{itemize} \item[]<.-> $\vm{h}_t = \phi(\vm{M}[\vm{h}_{t-1}; \vm{w}_{t-1}] \edinred{+ \vm{x}} + \vm{b})$ \item[]<.-> $\vm{z}_t = \vm{S}~\vm{h}_{t} + \vm{b'}$ \item[]<.-> $p(\vm{w}_t | \edinred{\vm{x}}, \vm{w}_{{ {\color{edinred} [1.]} ~Word encoded into \emph{1-hot} vector } \item<3->{ {\color{cyan} [2.]} Projection into an \textbf{\textit{embedding}} } ... ... 
@@ -219,18 +223,16 @@ A document & A summary \\ \item<8->{ {\color{orange} [7.]} Next word (most probable) } \end{itemize} \end{block} \column{0.5\textwidth} \\ \centering{ \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_1} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_2} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_3} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_4} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_5} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_6} \includegraphics[height=0.8\textwidth]<+>{figures_en/enc_dec_all_7} \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_1}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_2}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_3}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_4}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_5}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_6}<+> \includegraphics[height=0.8\textwidth]{figures_en/enc_dec_all_7}<+> }%centering \end{columns} ... ... @@ -240,7 +242,6 @@ A document & A summary \\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Results} \begin{block}{} \centerline{ \includegraphics[width=0.8\textwidth]{figures_en/nmt_sentence_length} } ... ... @@ -260,7 +261,6 @@ A document & A summary \\ \end{enumerate} } \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ... ... 
@@ -306,7 +306,7 @@ A document & A summary \\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{From vector to matrix representation} \begin{block}{} \begin{itemize} \item Represent input sequence with a matrix \item Generate output sequence using the matrix ... ... @@ -317,13 +317,13 @@ A document & A summary \\ \item[\ra] Solve the problem of gradient stream \item[] \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Representing sentences with a matrix} \begin{block}{} \begin{itemize} \item Fixed size vector: regardless of the input sequence size \item[] ... ... @@ -334,13 +334,13 @@ A document & A summary \\ \item[\ra] How to build this matrix? \item[] \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Concatenation} \begin{block}{} \begin{itemize} \item Concatenation of word embeddings \item simplest possible model ... ... @@ -352,14 +352,14 @@ A document & A summary \\ \item Using bidirectional RNNs \cite{bahdanau2014} \item[\ra] most used method \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Bidirectional Encoder} \begin{block}{} \centering{ \only<1>{ \includegraphics[height=0.5\textheight]{figures_en/bidir_enc_1} \\ {\color{gray} [1.]} ~\emph{1-hot} vector + projection + update \alert{forward} hidden unit } ... ... @@ -369,14 +369,14 @@ A document & A summary \\ {\color{brown} [2.]} \alert{Annotation} = concatenation of \alert{forward} and \alert{backward} vectors \\ {\small Every $\vm{h}_i$ encodes the full sentence with a focus on the \ith\ word} } } \end{block} \vspace{.2cm} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Attention Mechanism} \begin{block}{} \begin{itemize} \item How to process this matrix into the decoder? 
\item Reminder: decoder is made of one (or several) recurrent units ... ... @@ -389,28 +389,26 @@ A document & A summary \\ \item[\ra] \textbf{Attention mechanism} \item[] \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Attention mechanism} \begin{block}{Before: sentence represented by a vector } \centering{ \textbf{Before: sentence represented by a vector } \centerline{ \includegraphics[height=0.7\textheight]{figures_en/enc_dec_all} } \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Attention mechanism} \begin{block}{After: sentence represented by a matrix} \centering{ \textbf{After: sentence represented by a matrix} \centerline{ \includegraphics[height=0.7\textheight]{figures_en/dec_attention_0} } \end{block} \end{frame} ... ... @@ -425,7 +423,7 @@ A document & A summary \\ % \begin{columns} \column{0.5\textwidth} \begin{block}{} \begin{itemize} \item<+-> {\color{brown} [2.]} ~Decoder gets the \alert{annotations} from encoder. \item<+-> {\color{cyan} [3.]} ~ \alert{Attention weights} calculated with feedforward NN. \\ ... ... @@ -435,7 +433,7 @@ A document & A summary \\ \item<+-> {\color{purple} [5.]} Calculate probability distribution for \alert{all} words \item<+-> {\color{orange} [6.]} Generate next word (most probable) \end{itemize} \end{block} \column{0.5\textwidth} \end{columns} ... ... @@ -474,35 +472,33 @@ A document & A summary \\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{A word on gradients} \begin{block}{Without attention mechanism: } \centering{ \textbf{Without attention mechanism: } \centerline{ \only<1>{ \includegraphics[height=0.6\textheight]{figures_en/enc_dec_all} } \only<2>{ \includegraphics[height=0.6\textheight]{figures_en/dec_backprop} } } \begin{itemize} \item<2-> Gradients go through the last encoder hidden state. 
\end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{A word on gradients} \begin{block}{With attention mechanism: } \centering{ \textbf{With attention mechanism: } \centerline{ \only<1>{ \includegraphics[height=0.6\textheight]{figures_en/dec_attention_backprop} } } \begin{itemize} \item The attention mechanism facilitates gradient propagation towards the encoder \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Attention and translation} \begin{block}{ Some considerations/remarks :} \textbf{Some considerations/remarks:} \begin{itemize} \item Does a human translator memorise the whole source sentence and then proceed to translate? \begin{itemize} ... ... @@ -516,7 +512,6 @@ A document & A summary \\ \item Should humans be a model for machines? that's another story... \item[] \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ... ... @@ -524,14 +519,13 @@ A document & A summary \\ \frametitle{Attention and translation} \begin{columns} \column{0.5\textwidth} \begin{block}{Attention Mechanism \Ra\ alignment } \textbf{Attention Mechanism \Ra\ alignment} \begin{itemize} \item For each produced word, a set of attention weights is created (set length is size of source sequence) \item \textbf{Alignment} and translation models jointly trained! \item[\ra] \cite{bahdanau2014} \item[] \end{itemize} \end{block} \column{0.5\textwidth} \centering{ ... ... @@ -543,7 +537,6 @@ A document & A summary \\ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Summary} \begin{block}{ } \begin{itemize} \item Attention \begin{itemize} ... ... 
@@ -556,13 +549,12 @@ A document & A summary \\ \end{itemize} \item[] \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Algorithm} \centering{ \centerline{ \includegraphics[height=0.8\textheight]{figures_en/dec_algo} } \begin{itemize} ... ... File added File added  ... ... @@ -52,8 +52,6 @@ \usepackage[english]{babel} \usepackage[utf8]{inputenc} \usepackage{times} \usepackage{epsfig} \usepackage{comment} ... ... @@ -68,9 +66,10 @@ %\usepackage{xspace} %\usepackage{amsmath} \input ../macros.tex \input ../macros_en.tex \input ../macros_beamer.tex \input ../mycolors.tex \usepackage[absolute,showboxes,overlay]{textpos} %\TPshowboxestrue % commenter une fois fini ... ... @@ -168,7 +167,6 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Motivations} \begin{block}{} \begin{itemize} \item Many problems can reduce to transforming one sequence into another: \begin{itemize} ... ... @@ -190,7 +188,6 @@ \item image = pixel sequence (eventually 2D) \end{itemize} \end{itemize} \end{block} \end{frame} ... ... @@ -230,7 +227,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Summary} \begin{block}{Sentence representations} \textbf{Sentence representations} \begin{itemize} \item Fixed size vector from an RNN \item Matrix + attention mechanism ... ... @@ -242,17 +239,17 @@ \item[] \end{itemize} \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Attention Mechanism} \begin{block}{Image captioning \cite{xu2015showattendtell}} \centering{ \textbf{Image captioning \cite{xu2015showattendtell}} \centerline{ \includegraphics[height=0.75\textheight]{figures/img_caption_1} } \end{block} \end{frame} ... ...  ... ... 
@@ -12,30 +12,32 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Language Modelling} \begin{block}{Reminder: language modelling } \frametitle{Reminder: Language Modelling} \begin{itemize} \item A language model (LM) assigns a non-zero probability to a word sequence $\vw = (w_1, w_2, ..., w_\ell)$ \begin{eqnarray} p(\vw) & = & p(w_1) \times p(w_2|w_1) \times p(w_3 | w_1, w_2) \times \dots \times \nonumber \\ & & ~~~~~~ p(w_\ell | w_1, ..., w_{\ell-1}) \nonumber\\ & = & \prod_{t=1}^{\ell} p(w_t|w_1, ..., w_{t-1}) \nonumber \end{eqnarray} \end{itemize} \begin{itemize} \item Modelling language is done by {\bf modelling the probability of the next word } given the history of previous words. \item In practice: reduce history so that it is tractable and relevant (Markov hypothesis) \ra\ n-gram \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Neural Language Model} \begin{block}{Reminder: Feed-forward neural LM} \begin{itemize} % \item Toujours un modèle n-gram \item $ p(w_i | w_{i-n}, \cdots, w_{i-1}) \approx f(w_{i-n}, \cdots, w_{i-1}) $ \item $f$: function estimating probability of word $w_i$ from the $n-1$ previous words \ra\ learn with a NN \end{itemize} \end{block} \centerline{ \includegraphics[width=0.30\textwidth]{figures_en/fflm_all} } ... ... @@ -45,7 +47,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Neural Language Model} \begin{block}{Feedforward NN} \textbf{Feedforward NN} \begin{description} \item[1.] Word representation with '\alert{1-hot}' vector \item[\ra] $ w_j = \left[ 0, \cdots, 0, 1, 0, \cdots, 0 \right]^\top $ (1 at position $j$) ... ... 
@@ -61,7 +63,7 @@ \item[\ra] $\vm{d} = \phi ( \vm{U}^\top \vm{c} + \vm{b_U}) $ with $\vm{b_U}$ the bias \item[\ra] $\phi$: non-linear activation function (tanh) \end{description} \end{block} \begin{textblock*}{40mm}[0,0](93mm,20mm) \includegraphics[height=5cm]{figures_en/fflm_proj} \end{textblock*} ... ... @@ -70,7 +72,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Neural Language Model} \begin{block}{Feedforward NN} \textbf{Feedforward NN} %\begin{varblock}[7cm]{RdN Feedforward} \begin{description} \item[5 .] Calculate non-normalized score ... ... @@ -83,7 +85,7 @@ \item[] with $z_j$ the \jth\ element of $\vm{z}$ and $\|V\|$ the vocabulary size \item[] \end{description} \end{block} %\end{varblock} \begin{textblock*}{30mm}[0,0](90mm,12mm) \includegraphics[height=4.5cm]{figures_en/fflm_estim} ... ... @@ -93,7 +95,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Neural Language Model} \begin{block}{} \begin{itemize} % \item Entraînement par back-propagation du gradient d'erreur \item Maximum likelihood + backprop ... ... @@ -109,7 +111,7 @@ \item $\vm{d}$: sentence representation? \item[] \end{itemize} \end{block} \begin{textblock*}{30mm}[0,0](78mm,23mm) \includegraphics[height=4.5cm]{figures_en/fflm_all} \end{textblock*} ... ... 
@@ -118,23 +120,22 @@ %--------------------------------------------------------- \begin{frame} \frametitle{Embeddings} \begin{figure} \centering \includegraphics[height=6cm]{figures_en/Turian-WordTSNE_all}% \onslide<2->{ \llap{\raisebox{1cm}{% move next graphics to top right corner \centerline{ \includegraphics[width=0.6\textwidth]{figures_en/Turian-WordTSNE} } }}} \end{figure} \centering \includegraphics[height=\textheight]{figures_en/Turian-WordTSNE_all}% \begin{textblock*}{90mm}[0,0](40mm,10mm) \only<2>{ \includegraphics[width=.9\textwidth]{figures_en/Turian-WordTSNE}% } \end{textblock*} \end{frame} %--------------------------------------------------------- \begin{frame} \frametitle{Why does it work?} \begin{block}{} \begin{itemize} \item Better estimation for n-grams unseen in training corpus \item[\ra] backoff LM: reduce history size + weighting ... ... @@ -151,13 +152,14 @@ \item[] What is the probability that \edinred{10} is followed by \edinorange{dollars}? \item[] \end{itemize} \end{block} \end{frame} %--------------------------------------------------------- \begin{frame} \begin{block}{} \frametitle{} \begin{itemize} \item[] What is the probability that \edinred{10} is followed by \edinorange{dollars}? \item[] ... ... @@ -173,7 +175,7 @@ % \item[] \end{itemize} \end{block} \end{frame} %--------------------------------------------------------- ... ... @@ -181,7 +183,6 @@ \begin{frame} \frametitle{} \begin{block}{} \begin{itemize} \item Can we free the model from Markov property? \item[\ra] Non-Markovian model ... ... @@ -196,7 +197,7 @@ \item<2> Solution: \alert{compress history!} % \item[] \end{itemize} \end{block} \end{frame} %------------------------------------------------------------------------------------------------------------ ... ... 
@@ -205,12 +206,11 @@ \begin{frame} \frametitle{Recurrent Neural Networks} \begin{block}{} \begin{itemize} \item Problem: \textbf{sentences are of variable-length, not bounded!} \item Solution: \alert{compress history!} \item[] \Ra\ Solution: \alert{compress history!} \end{itemize} \end{block} \begin{block}{Protocol} \begin{enumerate} \item Initialise history $\vm{h}$ ... ... @@ -218,6 +218,7 @@ \item Predict next word $w_{i+1}$ using $\vm{h}_i$ \end{enumerate} \end{block} \end{frame} ... ... @@ -226,7 +227,6 @@ \frametitle{Recurrent Neural Networks} %\vspace{-.5cm} \begin{block}{} \begin{description} \item[1. \& 2.] 1-hot vectors + project $w_i$ into continuous space \item[\ra] $ \vm{c}_i = \vm{W} ^\top w_i \in \mathbb{R}^d $ \\ ... ... @@ -245,7 +245,6 @@ \item[] \item[] \end{description} \end{block}