Commit f1439bb4 authored by Loïc Barrault

textproc nn

parent 38809fb0
......@@ -8,3 +8,10 @@
year={2016}
}
@misc{mikolov2013,
title = {Efficient Estimation of Word Representations in Vector Space},
author = {Tomas Mikolov and Kai Chen and Greg S. Corrado and Jeffrey Dean},
year = {2013},
URL = {http://arxiv.org/abs/1301.3781}
}
......@@ -92,9 +92,9 @@
%\input{ie_plan.tex}
%\input{ie_introduction.tex} % 4th lecture + start 5th lecture
%\input{ie_ner.tex} % end of 5th lecture + 6th lecture
%\input{ie_relation_extraction.tex} % 7th lecture
%\input{ie_relation_extraction.tex} % 7th lecture + 8th lecture (shorter)
\input{textproc_nn.tex}
\input{textproc_nn.tex} % 9th lecture + eventually 10th
\input{sa_extra_reading}
......
......@@ -82,7 +82,7 @@ Hebb: \myemph{``Neurons that fire together, wire together''}
\vspace{.5cm}
Training method: change the weights $\vw$ if a training example $\vx$ is misclassified as follows:
\begin{itemize}
\item[] $\hat{\vw}^{new} = \hat{\vw}^{cur} + \hat{\vx} . y $ ~~~ with ~~~ $ y \in \{+1, -1\}$
\item[] $\vw^{new} = \vw^{cur} + \vx . y $ ~~~ with ~~~ $ y \in \{+1, -1\}$
\end{itemize}
\end{frame}
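%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A minimal NumPy sketch of the perceptron training rule above
% (w_new = w_cur + x.y on a misclassified example); the toy data,
% epoch count and zero initialisation are illustrative assumptions.
\begin{frame}[fragile]
\frametitle{Perceptron update rule: a NumPy sketch}
\begin{verbatim}
import numpy as np

def perceptron_train(X, y, epochs=10):
    """Perceptron rule: add x*y to w whenever x is misclassified."""
    w = np.zeros(X.shape[1])
    for _ in range(epochs):
        for x_i, y_i in zip(X, y):        # labels y_i in {+1, -1}
            if np.sign(w @ x_i) != y_i:   # misclassified example
                w = w + x_i * y_i         # w_new = w_cur + x . y
    return w

X = np.array([[1., 1.], [2., 2.], [-1., -1.], [-2., -1.]])
print(perceptron_train(X, np.array([1, 1, -1, -1])))
\end{verbatim}
\end{frame}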
......@@ -136,39 +136,407 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\
\frametitle{How to train a multilayer perceptron?}
\begin{block}{\center \textbf{Backpropagation}}
\begin{block}{\center \myemphb{Backpropagation}: Backward propagation of errors}
%\begin{center}
\begin{columns}
\begin{column}{.5\textwidth}
\[ \wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij} \]
\end{column}
\begin{column}{.5\textwidth}
\begin{itemize}
\item $E$: \textbf{loss function}
\item $\lambda$: \textbf{learning rate}
\item $\wij$: weight between neuron $i$ and $j$
\end{itemize}
\end{column}
\end{columns}
%\end{center}
\end{block}
\begin{itemize}
\item The error function depends on the task
\item Classification task \Ra\ estimate a probability distribution
\[
\begin{array}[t]{rcl@{\hspace{1cm}}rcl}
y_i & = & \ds \frac{e^{a_i}}{\sum_k e^{a_k}}
& \ds {\partial y_i} / {\partial a_k} & = & \delta_{ik}y_i - y_i y_k \\[10pt]
\ds E(\vy,\vc) & = & \ds - \sum_i c_i \log y_i
& \ds {\partial E} / {\partial y_i} & = & \ds - \frac{c_i}{y_i}
\end{array}
\]
\end{itemize}
\end{frame}
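%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A small NumPy check of the softmax / cross-entropy derivatives above:
% combining dE/dy_i = -c_i/y_i with dy_i/da_k gives dE/da = y - c.
% The logits and the one-hot target are illustrative values only.
\begin{frame}[fragile]
\frametitle{Softmax and cross-entropy gradients: a NumPy sketch}
\begin{verbatim}
import numpy as np

def softmax(a):
    e = np.exp(a - a.max())      # shift by max for numerical stability
    return e / e.sum()

a = np.array([2.0, -1.0, 0.5])   # toy pre-activations a_i
c = np.array([0.0, 1.0, 0.0])    # one-hot target c

y = softmax(a)
E = -np.sum(c * np.log(y))       # cross-entropy loss E(y, c)

# Chain rule: dE/da_k = sum_i (dE/dy_i)(dy_i/da_k) = y_k - c_k
dE_da = y - c
print(E, dE_da)
\end{verbatim}
\end{frame}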
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to train a multilayer perceptron?}
\begin{columns}[c]
\begin{column}{.5\textwidth}
\begin{block}{\center \myemphb{Chain rule}}
\begin{center}
$
\ds \frac{\partial \mathbf{E}}{\partial \mathbf{W}} =
\frac{\color{liumgreen} \partial \mathbf{E}}{\color{edinorange} \partial \mathbf{h^{2}}}
\frac{\color{edinorange} \partial \mathbf{h^{2}}}{\color{cyan} \partial \mathbf{h^{1}}}
\frac{\color{cyan} \partial \mathbf{h^{1}}}{\partial \mathbf{W}}
$
\end{center}
\end{block}
\end{column}
\begin{column}{.5\textwidth}
\begin{center} \includegraphics[width=4cm]{mlp_bp_grad} \end{center}
\end{column}
\end{columns}
\textbf{Output layer}
\[
\ds \frac{\partial E}{\partial \wij} = \ds \underbrace{\frac{\partial E}{\partial a_i}}_{\delta_i} \, \frac{\partial a_i}{\partial \wij} = \delta_i \, h_j
\text{~~with~~}
\delta_i = \ds \frac{\partial E}{\partial y_i} \, \frac{\partial y_i}{\partial a_i} = \ds \frac{\partial E}{\partial y_i} \, f^{~'}(a_i)
\]
\textbf{Hidden layer}
\[
\ds \frac{\partial E}{\partial v_{jk}} = \ds \underbrace{\frac{\partial E}{\partial z_j}}_{\gamma_j} \, \frac{\partial z_j}{\partial v_{jk}} = \gamma_j \,x_k
\text{~~with~~}
\gamma_j = \ds \sum_i \frac{\partial E}{\partial a_i} \, \frac{\partial a_i}{\partial h_j} \, \frac{\partial h_j}{\partial z_j}
= \ds \sum_i \delta_i \, \wij \, f^{~'}(z_j)
= f^{~'}(z_j) \ds \sum_i \delta_i \wij
\]
\end{frame}
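%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A NumPy sketch of one backward pass implementing the delta/gamma
% formulas above for a single hidden layer. The tanh activation, the
% squared-error loss and all toy shapes are illustrative assumptions.
\begin{frame}[fragile]
\frametitle{Backpropagation through one hidden layer: a NumPy sketch}
\begin{verbatim}
import numpy as np

def f(u):  return np.tanh(u)            # activation f
def fp(u): return 1.0 - np.tanh(u)**2   # its derivative f'

rng = np.random.default_rng(0)
x = rng.normal(size=4)                  # input x_k
V = rng.normal(size=(3, 4))             # hidden weights v_jk
W = rng.normal(size=(2, 3))             # output weights w_ij
t = np.array([1.0, 0.0])                # target (squared-error loss)

z = V @ x;  h = f(z)                    # hidden layer
a = W @ h;  y = f(a)                    # output layer
dE_dy = y - t                           # dE/dy for E = 1/2 ||y - t||^2

delta = dE_dy * fp(a)                   # delta_i = dE/dy_i * f'(a_i)
dE_dW = np.outer(delta, h)              # dE/dw_ij = delta_i * h_j
gamma = fp(z) * (W.T @ delta)           # gamma_j = f'(z_j) sum_i delta_i w_ij
dE_dV = np.outer(gamma, x)              # dE/dv_jk = gamma_j * x_k
\end{verbatim}
\end{frame}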
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Multilayer perceptron: training}
\begin{itemize}
\item[1.] Normalise data
\item[2.] Initialise the weights $\mW$
\item[3.] \alert{Repeat}
\begin{itemize}
\item Pick a \textbf{batch} of examples $(\vx,\vc)$
\item \textbf{Forward} pass: propagate the batch $\vx$ through the network \ra\ $\vy$
\item Calculate the error $E(\vy,\vc)$
\item \textbf{Backward} pass: \myemphb{backpropagation} \ra\ $\nabla \wij$
\item Update weights $\wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij}$
\item Optionally adjust the training meta-parameters (e.g. the learning rate $\lambda$)
\end{itemize}
\item[ ] \alert{until convergence}
\end{itemize}
\end{frame}
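%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A hedged PyTorch sketch of the training procedure above; the toy
% data, network sizes, learning-rate schedule and number of epochs
% are illustrative assumptions, not a prescribed recipe.
\begin{frame}[fragile]
\frametitle{MLP training loop: a PyTorch sketch}
\begin{verbatim}
import torch, torch.nn as nn

X = torch.randn(256, 20)                 # toy data: 256 examples
c = torch.randint(0, 3, (256,))          # 3 classes
X = (X - X.mean(0)) / X.std(0)           # 1. normalise the data

model = nn.Sequential(nn.Linear(20, 32), nn.Tanh(),
                      nn.Linear(32, 3))  # 2. layers initialise the weights
loss_fn = nn.CrossEntropyLoss()          # error function E(y, c)
opt = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(20):                  # 3. repeat until convergence
    for i in range(0, len(X), 32):       # pick a batch (x, c)
        xb, cb = X[i:i+32], c[i:i+32]
        loss = loss_fn(model(xb), cb)    # forward pass + error
        opt.zero_grad(); loss.backward() # backward pass: backpropagation
        opt.step()                       # w_new = w_cur - lr * gradient
    for g in opt.param_groups:           # optionally adjust meta-parameters
        g["lr"] *= 0.95
\end{verbatim}
\end{frame}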
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\vfill
\centering
\Huge{\liumcyan{That's great, but where is the text?!?}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\begin{block}{\center \myemphb{Word Embedding}}
\begin{center}
Vector representation of a word \Ra\ vector of real values\\
\end{center}
\end{block}
Also called continuous space representation.
\begin{itemize}
\item<2-> What would be the simplest way of obtaining vectors? \only<3->{\Ra\ so-called \myemphb{1-hot vector}:}
\item[]<3-> \begin{itemize}
\item vector of size equal to \textbf{vocabulary size}
\item contains 0 everywhere except for a single 1 at a specific position
\end{itemize}
\vspace{1cm}
\item<4-> Is that a good representation? \only<5->{\Ra\ \textbf{NO!}}
\item[]<5-> \begin{itemize}
\item distance between any two words is the same for all word pairs
\item the position of the "1" is arbitrary
\item \ra\ it is just a \textbf{coding}
\end{itemize}
\end{itemize}
\only<3->{
\begin{textblock*}{50mm}[0,0](105mm,40mm)
\includegraphics[width=4cm]{one-hot}
\end{textblock*}
}
\end{frame}
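%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A tiny NumPy sketch of 1-hot vectors; the toy vocabulary is an
% illustrative assumption. It also shows why 1-hot is only a coding:
% every pair of distinct words is equally distant.
\begin{frame}[fragile]
\frametitle{1-hot vectors: a NumPy sketch}
\begin{verbatim}
import numpy as np

vocab = ["a", "galaxy", "far", "away"]        # toy vocabulary
word2id = {w: i for i, w in enumerate(vocab)}

def one_hot(word):
    v = np.zeros(len(vocab))                  # size = vocabulary size
    v[word2id[word]] = 1.0                    # single 1 at the word's index
    return v

# Same distance for every pair of distinct words: no semantics encoded
print(np.linalg.norm(one_hot("galaxy") - one_hot("far")))   # sqrt(2)
print(np.linalg.norm(one_hot("galaxy") - one_hot("away")))  # sqrt(2)
\end{verbatim}
\end{frame}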
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\myemph{The semantic properties of the words are encoded in the dimensions of the vector}
\begin{minipage}[t][.7\textheight]{\textwidth}
\centering
\includegraphics[width=.7\textwidth]{king-white-embedding}<1->
\includegraphics[width=.7\textwidth]{king-colored-embedding}<2->
\includegraphics[width=.4\textwidth]{queen-woman-girl-embeddings}<3->
\end{minipage}
\vfill
\source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} }
\smallskip
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\myemph{The semantic properties of the words are encoded in the dimensions of the vector}
\begin{center}
\includegraphics[width=.5\textwidth]{king-analogy-viz}
\end{center}
Can be learned in several ways:
\begin{itemize}
\item Extract handcrafted meaningful features
\item \myemph{Use a neural network!}<2->
\end{itemize}
\vfill
\source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} }
\end{frame}
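%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A sketch of the king - man + woman analogy with cosine similarity.
% 'emb' stands for any pretrained embedding lookup (e.g. word2vec
% vectors); the dict-of-NumPy-vectors interface is a hypothetical
% assumption for illustration.
\begin{frame}[fragile]
\frametitle{Embedding arithmetic: an analogy sketch}
\begin{verbatim}
import numpy as np

def cosine(u, v):
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

def analogy(emb, a, b, c):
    """Word whose vector is closest to emb[b] - emb[a] + emb[c]."""
    target = emb[b] - emb[a] + emb[c]
    candidates = (w for w in emb if w not in (a, b, c))
    return max(candidates, key=lambda w: cosine(emb[w], target))

# With good embeddings: analogy(emb, "man", "king", "woman") -> "queen"
\end{verbatim}
\end{frame}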
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Word embeddings: word2vec}
Language modelling task: given a prefix (sequence of words), predict the next word
\begin{columns}[c]
\begin{column}{.5\textwidth}
\begin{center}
\textbf{CBOW}\\
\includegraphics[width=4cm]{cbow}
\end{center}
\end{column}
\begin{column}{.5\textwidth}
\begin{center}
\textbf{SkipGram}\\
\includegraphics[width=4cm]{skipgram}
\end{center}
\end{column}
\end{columns}
\source{ \textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!} }
\source{Mikolov et al. \textbf{Efficient Estimation of Word Representations in Vector Space} \cite{mikolov2013}}
\end{frame}
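%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A sketch of how skip-gram training pairs are extracted from text;
% the window size and the example sentence are illustrative choices.
\begin{frame}[fragile]
\frametitle{Skip-gram training pairs: a sketch}
\begin{verbatim}
def skipgram_pairs(tokens, window=2):
    """(centre word, context word) pairs, as in the skip-gram model."""
    pairs = []
    for i, centre in enumerate(tokens):
        lo, hi = max(0, i - window), min(len(tokens), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((centre, tokens[j]))  # predict context
    return pairs

# CBOW goes the other way: predict the centre word from its context.
sentence = "a long time ago in a galaxy far far away".split()
print(skipgram_pairs(sentence)[:5])
\end{verbatim}
\end{frame}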
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why does it work?}
\begin{center}
\includegraphics[width=0.8\textwidth]{word_embeddings}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why does it work?}
\begin{itemize}
\item Let's assume that the word representations are \myemph{organised semantically}
\item words $w_1$ and $w_2$ having similar meaning would be \textbf{close to each other} in this space
\item[] \ra\ Consequently $\mathcal{F}(w_1) \approx \mathcal{F}(w_2)$
\end{itemize}
\begin{columns}[c]
\begin{column}{.5\textwidth}
\begin{itemize}
\item[] Language modelling:
\end{itemize}
\begin{enumerate}
\item I have got \edinred{10} \blue{euros} in my wallet
\item This item costs \liumgreen{11} \blue{euros}
\item In the U.S. it is \liumgreen{11} \edinorange{dollars} !
\end{enumerate}
\end{column}
\begin{column}{.5\textwidth}
\begin{center}
\includegraphics<1>[width=0.8\textwidth]{fflm_generalisation}
\includegraphics<2>[width=0.8\textwidth]{fflm_generalisation2}
\end{center}
\end{column}
\end{columns}
\Ra\ What is the probability that \edinred{10} is followed by \edinorange{dollars}?
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent sentences?}
Sentence = sequence of words \Ra\ we need an \myemphb{encoder}\\
Several possibilities have been developed:
\begin{itemize}
\item<2-> \myemph{Recurrent neural network} (RNN)
\begin{itemize}
\item and its \textbf{bidirectional} version
\item representation = single vector or matrix
\item[]
\end{itemize}
\item<4-> \myemph{Convolutional neural network} (CNN)
\begin{itemize}
\item produces a single vector representation
\item[]
\end{itemize}
\item<5-> Very recently \myemph{Transformers} = self-attention
\begin{itemize}
\item representation = matrix (1 vector per word)
\item Must read: \textbf{\url{http://jalammar.github.io/illustrated-transformer/}}
\end{itemize}
\end{itemize}
\begin{textblock*}{30mm}[0,0](110mm,0mm)
\includegraphics<2>[height=4cm]{figures/rnn_proj}
\includegraphics<3->[height=4cm]{figures/rnn_proj2}
\end{textblock*}
\begin{textblock*}{30mm}[0,0](110mm,35mm)
\includegraphics<4->[height=0.25\textheight]{figures/conv_sent_encoder}
\end{textblock*}
\end{frame}
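%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A PyTorch sketch of a bidirectional RNN encoder; the vocabulary
% size, embedding and hidden dimensions and the GRU cell are
% illustrative assumptions.
\begin{frame}[fragile]
\frametitle{Bidirectional RNN encoder: a PyTorch sketch}
\begin{verbatim}
import torch, torch.nn as nn

emb = nn.Embedding(1000, 64)               # toy vocabulary of 1000 ids
birnn = nn.GRU(input_size=64, hidden_size=128,
               batch_first=True, bidirectional=True)

word_ids = torch.randint(0, 1000, (1, 10)) # one sentence of 10 word ids
states, h_n = birnn(emb(word_ids))

print(states.shape)  # (1, 10, 256): matrix, 1 vector per word
print(h_n.shape)     # (2, 1, 128): last forward + last backward state
sentence_vec = torch.cat([h_n[0], h_n[1]], dim=-1)  # single vector
\end{verbatim}
\end{frame}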
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to classify sentences?}
\begin{itemize}
\item The classifier is a neural network implementing a complex function $\mathcal{F}$
\begin{itemize}
\item that operates in the \textbf{continuous space}
\item that maps input vectors to a \textbf{probability distribution} over the desired classes
\end{itemize}
\end{itemize}
\begin{enumerate}
\item Encode the sentence
\begin{itemize}
\item get a vector
\item get a matrix (1 vector per word) \ra\ compress into 1 vector
\begin{itemize}
\item \textbf{pooling} operation (usually mean or max)
\item concatenation
\end{itemize}
\end{itemize}
\item Non-linear classification layer(s) \ra\ get a vector of scores $\vz$ (1 for each class)
\item Get a probability distribution by normalization \ra\ softmax
\begin{itemize}
\item[] \begin{center} $p(\vc = j | \theta) = \ds \frac{ e^{\vz_j}}{\ds \sum_{k=1}^{\|V\|} e^{\vz_k}}$ \end{center}
\end{itemize}
\end{enumerate}
\end{frame}
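%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A PyTorch sketch of the encode / pool / classify / softmax pipeline
% above; the GRU encoder, mean pooling and all sizes are illustrative
% choices.
\begin{frame}[fragile]
\frametitle{Sentence classification pipeline: a PyTorch sketch}
\begin{verbatim}
import torch, torch.nn as nn

class SentenceClassifier(nn.Module):
    def __init__(self, vocab=1000, dim=64, hidden=128, n_classes=3):
        super().__init__()
        self.emb = nn.Embedding(vocab, dim)
        self.enc = nn.GRU(dim, hidden, batch_first=True,
                          bidirectional=True)
        self.clf = nn.Sequential(nn.Linear(2 * hidden, hidden),
                                 nn.Tanh(), nn.Linear(hidden, n_classes))

    def forward(self, word_ids):
        states, _ = self.enc(self.emb(word_ids))  # 1 vector per word
        pooled = states.mean(dim=1)               # mean pooling
        z = self.clf(pooled)                      # scores, 1 per class
        return torch.softmax(z, dim=-1)           # probabilities

probs = SentenceClassifier()(torch.randint(0, 1000, (2, 10)))
\end{verbatim}
\end{frame}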
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Encoding a sentence with a (bi-)RNN}
Sentence: "\textbf{A long time ago in a galaxy far , far away}"
\begin{center}
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_1}<+>% if you remove the '%' then the
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_2}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_3}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_7}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_10}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_all}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_1}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_2}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_7}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_fall}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r1}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r2}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r3}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_rall}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_all}<+>%
\end{center}%centering
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Pooling operation}
Compute the feature-wise \myemph{average} or \myemph{maximum} \textbf{activation} of a set of vectors\\
Aim: sub-sampling \ra\ result is a vector!
\begin{center}
\includegraphics[height=0.5\textheight]{figures/pooling}%
\end{center}
\source{A comment on max pooling to read: \url{https://mirror2image.wordpress.com/2014/11/11/geoffrey-hinton-on-max-pooling-reddit-ama/}}
\end{frame}
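%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% A NumPy sketch of feature-wise mean and max pooling; the 5 x 4
% matrix of word vectors contains illustrative values only.
\begin{frame}[fragile]
\frametitle{Mean and max pooling: a NumPy sketch}
\begin{verbatim}
import numpy as np

H = np.array([[0.1, 0.7, 0.3, 0.0],   # 5 word vectors of dimension 4
              [0.9, 0.2, 0.5, 0.4],
              [0.4, 0.8, 0.1, 0.6],
              [0.2, 0.1, 0.9, 0.3],
              [0.5, 0.3, 0.2, 0.8]])

mean_pooled = H.mean(axis=0)   # feature-wise average -> one vector
max_pooled  = H.max(axis=0)    # feature-wise maximum -> one vector
\end{verbatim}
\end{frame}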
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Classification layer \Ra\ Softmax}
Get a probability distribution by normalization \ra\ softmax: $p(\vc = j | \theta) = \ds \frac{ e^{\vz_j}}{\ds \sum_{k=1}^{\|V\|} e^{\vz_k}}$
\begin{center}
\includegraphics[height=0.6\textheight]{figures/classif_layer}%
\end{center}
\end{frame}
......@@ -188,11 +556,9 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\
\myemph{Project} or represent the \textbf{text} into a \myemph{continuous space} and train an estimator operating in this space to compute the probability of the sentiment.
\end{block}
Schematically, it looks like this:
%\includegraphics[width=0.75\textwidth]{sa_nn}
\begin{center}
\includegraphics[height=0.6\textheight]{sa_nn}
\end{center}
\end{frame}
......