 ... ... @@ -92,9 +92,9 @@ %\input{ie_plan.tex} %\input{ie_introduction.tex} % 4th lecture + start 5th lecture %\input{ie_ner.tex} % end of 5th lecture + 6th lecture %\input{ie_relation_extraction.tex} % 7th lecture %\input{ie_relation_extraction.tex} % 7th lecture + 8th lecture (shorter) \input{textproc_nn.tex} \input{textproc_nn.tex} % 9th lecture + possibly 10th \input{sa_extra_reading} ... ...
 ... ... @@ -82,7 +82,7 @@ Hebb: \myemph{``Neurons that fire together, wire together''} \vspace{.5cm} Training method: change the weights $\vw$ if a training example $\vx$ is misclassified as follows: \begin{itemize} \item[] $\hat{\vw}^{new} = \hat{\vw}^{cur} + \hat{\vx} . y$ ~~~ with ~~~ $y \in \{+1, -1\}$ \item[] $\vw^{new} = \vw^{cur} + \vx . y$ ~~~ with ~~~ $y \in \{+1, -1\}$ \end{itemize} \end{frame} ... ... @@ -136,39 +136,407 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\ \frametitle{How to train a multilayer perceptron?} \begin{block}{\center \textbf{Backpropagation}} \begin{block}{\center \myemphb{Backpropagation}: Backward propagation of errors} %\begin{center} \begin{columns} \begin{column}{.5\textwidth} $\wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij}$ \end{column} \begin{column}{.5\textwidth} \begin{itemize} \item $E$: \textbf{loss function} \item $\lambda$: \textbf{learning rate} \item $\wij$: weight between neuron $i$ and $j$ \end{itemize} \end{column} \end{columns} %\end{center} \end{block} \begin{itemize} \item Error function depending on the task \item Classification task \Ra\ estimate a probability distribution $\begin{array}[t]{rcl@{\hspace{1cm}}rcl} y_i & = & \ds \frac{e^{a_i}}{\sum_k e^{a_k}} & \ds {\partial y_i} / {\partial a_k} & = & \delta_{ik}y_i - y_i y_k \\[10pt] \ds E(\vy,\vc) & = & \ds -\sum_i c_i \log y_i & \ds {\partial E} / {\partial y_i} & = & \ds -\frac{c_i}{y_i} \end{array}$ \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to train a multilayer perceptron?} \begin{columns}[c] \begin{column}{.5\textwidth} \begin{block}{\center \myemphb{Chain rule}} \begin{center} $\ds \frac{\partial \mathbf{E}}{\partial \mathbf{W}} = \frac{\color{liumgreen} \partial \mathbf{E}}{\color{edinorange} \partial \mathbf{h^{2}}} \frac{\color{edinorange} \partial \mathbf{h^{2}}}{\color{cyan} \partial \mathbf{h^{1}}} \frac{\color{cyan} \partial 
\mathbf{h^{1}}}{\partial \mathbf{W}}$ \end{center} \end{block} \end{column} \begin{column}{.5\textwidth} \begin{center} \includegraphics[width=4cm]{mlp_bp_grad} \end{center} \end{column} \end{columns} \textbf{Output layer} $\ds \frac{\partial E}{\partial \wij} = \ds \underbrace{\frac{\partial E}{\partial a_i}}_{\delta_i} \, \frac{\partial a_i}{\partial \wij} = \delta_i \, h_j \text{~~with~~} \delta_i = \ds \frac{\partial E}{\partial y_i} \, \frac{\partial y_i}{\partial a_i} = \ds \frac{\partial E}{\partial y_i} \, f^{~'}(a_i)$ \textbf{Hidden layer} $\ds \frac{\partial E}{\partial v_{jk}} = \ds \underbrace{\frac{\partial E}{\partial z_j}}_{\gamma_j} \, \frac{\partial z_j}{\partial v_{jk}} = \gamma_j \,x_k \text{~~with~~} \gamma_j = \ds \sum_i \frac{\partial E}{\partial a_i} \, \frac{\partial a_i}{\partial h_j} \, \frac{\partial h_j}{\partial z_j} = \ds \sum_i \delta_i \, \wij \, f^{~'}(z_j) = f^{~'}(z_j) \ds \sum_i \delta_i \wij$ \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Multilayer perceptron: training} \begin{itemize} \item[1.] Normalise data \item[2.] Initialise the weights $\mW$ \item[3.] \alert{Repeat} \begin{itemize} \item Pick a \textbf{batch} of examples $(\vx,\vc)$ \item \textbf{Forward} pass: propagate the batch $\vx$ through the network \ra\ $\vy$ \item Calculate the error $E(\vy,\vc)$ \item \textbf{Backward} pass: \myemphb{backpropagation} \ra\ $\nabla \wij$ \item Update weights $\wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij}$ \item Optionally change the training meta-parameters (e.g. 
learning rate $\lambda$) \end{itemize} \item[ ] \alert{until convergence} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{} \vfill \centering \Huge{\liumcyan{That's great, but where is the text?!?}} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to represent words?} \begin{block}{\center \myemphb{Word Embedding}} \begin{center} \Large{Backward propagation of errors} Vector representation of a word \Ra\ vector of real values\\ \end{center} \end{block} Also called continuous space representation. \begin{itemize} \item<2-> What would be the simplest way of obtaining vectors? \only<3->{\Ra\ so-called \myemphb{1-hot vector}:} \item[]<3-> \begin{itemize} \item vector of size equal to \textbf{vocabulary size} \item contains 0 everywhere except for a single 1 at a specific position \end{itemize} \vspace{1cm} \item<4-> Is that a good representation? \only<5->{\Ra\ \textbf{NO!}} \item[]<5-> \begin{itemize} \item distance between any two words is the same for all word pairs \item position of the ``1'' is arbitrary \item \ra\ it is just a \textbf{coding} \end{itemize} \end{itemize} \only<3->{ \begin{textblock*}{50mm}[0,0](105mm,40mm) \includegraphics[width=4cm]{one-hot} \end{textblock*} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to represent words?} \myemph{The semantic properties of the words are encoded in the dimensions of the vector} \begin{minipage}[t][.7\textheight]{\textwidth} \centering \includegraphics[width=.7\textwidth]{king-white-embedding}<1-> \includegraphics[width=.7\textwidth]{king-colored-embedding}<2-> \includegraphics[width=.4\textwidth]{queen-woman-girl-embeddings}<3-> \end{minipage} \vfill \source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} } \smallskip \end{frame} 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to represent words?} \myemph{The semantic properties of the words are encoded in the dimensions of the vector} \begin{center} \includegraphics[width=.5\textwidth]{king-analogy-viz} \end{center} Can be learned in several ways: \begin{itemize} \item Extract handcrafted meaningful features \item \myemph{Use a neural network!}<2-> \end{itemize} \vfill \source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} } \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Word embeddings: word2vec} Language modelling task: given a prefix (sequence of words), predict the next word \begin{columns}[c] \begin{column}{.5\textwidth} \begin{center} \textbf{CBOW}\\ \includegraphics[width=4cm]{cbow} \end{center} \end{column} \begin{column}{.5\textwidth} \begin{center} \textbf{SkipGram}\\ \includegraphics[width=4cm]{skipgram} \end{center} \end{column} \end{columns} \source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} } \source{Mikolov et al. 
\textbf{Efficient Estimation of Word Representations in Vector Space} \cite{mikolov2013}} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Why does it work?} \begin{center} \includegraphics[width=0.8\textwidth]{word_embeddings} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Why does it work?} \begin{itemize} \item Let's assume that the word representations are \myemph{organised semantically} \item words $w_1$ and $w_2$ having similar meaning would be \textbf{close to each other} in this space \item[] \ra\ Consequently $\mathcal{F}(w_1) \approx \mathcal{F}(w_2)$ \end{itemize} \begin{columns}[c] \begin{column}{.5\textwidth} \begin{itemize} \item[] Language modelling: \end{itemize} \begin{enumerate} \item I have got \edinred{10} \blue{euros} in my wallet \item This item costs \liumgreen{11} \blue{euros} \item In the U.S. it is \liumgreen{11} \edinorange{dollars} ! \end{enumerate} \end{column} \begin{column}{.5\textwidth} \begin{center} \includegraphics<1>[width=0.8\textwidth]{fflm_generalisation} \includegraphics<2>[width=0.8\textwidth]{fflm_generalisation2} \end{center} \end{column} \end{columns} \Ra\ What is the probability that \edinred{10} is followed by \edinorange{dollars}? \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to represent sentences?} Sentence = sequence of words \Ra\ we need an \myemphb{encoder} Several possibilities have been developed: \begin{itemize} \item<2-> \myemph{Recurrent neural network} (RNN) \begin{itemize} \item and its \textbf{bidirectional} version \item representation = single vector or matrix \item[] \end{itemize} \item<4-> \myemph{Convolutional neural network} (CNN) \begin{itemize} \item produces a single vector representation \item[] \end{itemize} \item<5-> Very recently \myemph{Transformers} = self-attention \begin{itemize} \item What error? 
\Ra\ Error function depending on the task \item Estimating a real value: \ra\ \item representation = matrix (1 vector per word) \item Must read: \textbf{\url{http://jalammar.github.io/illustrated-transformer/}} \end{itemize} \end{itemize} \begin{textblock*}{30mm}[0,0](110mm,0mm) \includegraphics<2>[height=4cm]{figures/rnn_proj} \includegraphics<3->[height=4cm]{figures/rnn_proj2} \end{textblock*} \begin{textblock*}{30mm}[0,0](110mm,35mm) \includegraphics<4->[height=0.25\textheight]{figures/conv_sent_encoder} \end{textblock*} \begin{textblock*}{30mm}[0,0](110mm,35mm) \includegraphics<5->[height=0.25\textheight]{figures/conv_sent_encoder} \end{textblock*} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{How to classify sentences?} \begin{itemize} \item The classifier is a neural network implementing a complex function $\mathcal{F}$ \begin{itemize} \item that operates in the \textbf{continuous space} \item that maps input vectors to a \textbf{probability distribution} over the desired classes \end{itemize} \end{itemize} \begin{enumerate} \item Encode the sentence \begin{itemize} \item get a vector \item get a matrix (1 vector per word) \ra\ compress into 1 vector \begin{itemize} \item \textbf{pooling} operation (usually mean or max) \item concatenation \end{itemize} \end{itemize} \item Non-linear classification layer(s) \ra\ get a vector of scores $\vz$ (1 for each class) \item Get a probability distribution by normalization \ra\ softmax \begin{itemize} \item[] \begin{center} $p(\vc = j | \theta) = \ds \frac{ e^{\vz_j}}{\ds \sum_{k=1}^{\|V\|} e^{\vz_k}}$ \end{center} \end{itemize} \end{enumerate} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Encoding a sentence with a (bi-)RNN} Sentence: "\textbf{A long time ago in a galaxy far , far away}" \begin{center} % \includegraphics[height=0.8\textheight]{figures/rnn_seq_1}<+>% if you remove the '%' then the % 
\includegraphics[height=0.8\textheight]{figures/rnn_seq_2}<+>% % \includegraphics[height=0.8\textheight]{figures/rnn_seq_3}<+>% % \includegraphics[height=0.8\textheight]{figures/rnn_seq_7}<+>% % \includegraphics[height=0.8\textheight]{figures/rnn_seq_10}<+>% % \includegraphics[height=0.8\textheight]{figures/rnn_seq_all}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_1}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_2}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_7}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_fall}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r1}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r2}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r3}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_rall}<+>% \includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_all}<+>% \end{center}%centering \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Pooling operation} Compute the feature-wise \myemph{average} or \myemph{maximum} \textbf{activation} of a set of vectors\\ Aim: sub-sampling \ra\ result is a vector! \begin{center} \includegraphics[height=0.5\textheight]{figures/pooling}% \end{center} \source{A comment on max pooling to read: \url{https://mirror2image.wordpress.com/2014/11/11/geoffrey-hinton-on-max-pooling-reddit-ama/}} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Classification layer \Ra\ Softmax} Get a probability distribution by normalization \ra\ softmax: $p(\vc = j | \theta) = \ds \frac{ e^{\vz_j}}{\ds \sum_{k=1}^{\|V\|} e^{\vz_k}}$ \begin{center} \includegraphics[height=0.6\textheight]{figures/classif_layer}% \end{center} \end{frame} ... ... 
@@ -188,11 +556,9 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\ \myemph{Project} or represent the \textbf{text} into a \myemph{continuous space} and train an estimator operating in this space to compute the probability of the sentiment. \end{block} Basically it is like: %\includegraphics[width=0.75\textwidth]{sa_nn} \begin{center} \includegraphics[height=0.6\textheight]{sa_nn} \end{center} \end{frame} ... ...
