Commit 2c5b2051 authored by Loïc Barrault

transformer model

parent 8533d34f
%!TEX root = m2_trad_neuronale.tex
\section{Transformers}
\sectionpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{A non-recurrent model for translation}
\begin{block}{}
\begin{itemize}
\item Gradients are difficult to control in RNNs
\item[\ra] GRUs and LSTMs improve on this
\item[\ra] The attention mechanism shortens the gradient path
\item[]
\item[\ra] New model proposed by \cite{VaswaniSPUJGKP17}
\item No more recurrence
\item Highly parallelisable model
\item New state of the art for the last few months!
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer}
\begin{block}{"Attention is all you need", \cite{VaswaniSPUJGKP17}}
\centering{
\includegraphics[height=0.75\textheight]{transformer_architecture}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer: key components}
\begin{columns}
\begin{column}{.60\textwidth}
\begin{block}{}
\begin{itemize}
\item Encoder/decoder model
\item[\ra] deep \Ra\ $N$ layers
\item Positional embeddings
\item[\ra] Create an embedding that encodes the position of each word in the sentence
\item Multi-head attention (\texttt{multi-head})
\item \texttt{Feed-forward} projection
\item[\ra] dimensionality reduction
\item Residual connections
\item[\ra] Ease gradient propagation (cf. the lecture on CNNs); see the layer sketch on the next slide
\end{itemize}
\end{block}
\end{column}
\begin{column}{.40\textwidth}
%\begin{block}{"Attention is all you need", \cite{VaswaniSPUJGKP17}}
\centering{
\includegraphics[height=0.60\textheight]{transformer_architecture}
}
%\end{block}
\end{column}
\end{columns}
\end{frame}
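%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Transformer: key components (sketch)}
A minimal NumPy sketch of how one encoder layer composes these components; the layer normalisation corresponds to the ``Add \& Norm'' boxes of the figure, and the function names and toy sub-layers are illustrative, not the reference implementation of \cite{VaswaniSPUJGKP17}.
{\scriptsize
\begin{verbatim}
import numpy as np

def layer_norm(x, eps=1e-6):                   # normalisation after each sub-layer
    return (x - x.mean(-1, keepdims=True)) / (x.std(-1, keepdims=True) + eps)

def encoder_layer(x, self_attention, feed_forward):
    x = layer_norm(x + self_attention(x))      # residual connection
    x = layer_norm(x + feed_forward(x))        # residual connection
    return x

def encoder(x, layers):                        # N stacked layers
    for self_attention, feed_forward in layers:
        x = encoder_layer(x, self_attention, feed_forward)
    return x

# Toy run: identity attention and a random linear feed-forward projection
d, N = 512, 6
layers = [(lambda h: h, lambda h: h @ np.random.randn(d, d)) for _ in range(N)]
out = encoder(np.random.randn(10, d), layers)  # (10, 512)
\end{verbatim}
}
\end{frame}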
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Positional embeddings}
\begin{block}{}
\begin{itemize}
\item Non-recurrent, non-convolutional model
\item[\ra] positional information is lost
\item[\ra] can be absolute or relative
\item[]
\centering{ $PE_{(pos, 2i)} = \sin \left( pos / 10000^{2i/emb} \right)$ \\
$PE_{(pos, 2i+1)} = \cos \left( pos / 10000^{2i/emb} \right)$ }
\end{itemize}
\centering{
\includegraphics[height=0.30\textheight]{positionnal_emb}
}
\begin{itemize}
\item Another solution: learn the embeddings \ra\ no difference in performance (NumPy sketch on the next slide)
\end{itemize}
\end{block}
\end{frame}
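%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Positional embeddings: sketch}
A minimal NumPy sketch of the sinusoidal embeddings above; the function name is illustrative and \texttt{emb} (assumed even) is the embedding dimension.
{\scriptsize
\begin{verbatim}
import numpy as np

def positional_embeddings(max_len, emb):
    # PE[pos, 2i] = sin(pos / 10000^(2i/emb)),  PE[pos, 2i+1] = cos(...)
    pos = np.arange(max_len)[:, None]              # (max_len, 1)
    two_i = np.arange(0, emb, 2)[None, :]          # 0, 2, 4, ..., emb-2
    angles = pos / np.power(10000.0, two_i / emb)  # (max_len, emb/2)
    pe = np.zeros((max_len, emb))
    pe[:, 0::2] = np.sin(angles)                   # even dimensions: sin
    pe[:, 1::2] = np.cos(angles)                   # odd dimensions: cos
    return pe

pe = positional_embeddings(max_len=50, emb=512)    # added to the word embeddings
\end{verbatim}
}
\end{frame}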
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Multi-head attention}
\begin{columns}
\begin{column}{.55\textwidth}
\begin{block}{}
\begin{itemize}
\item Multiple attention heads
\item[\ra] focus on several representation subspaces
\item[\ra] and on different positions
\item In comparison: a single attention head averages all of this (sketch on the next slide)
\end{itemize}
\end{block}
{\small
$MultiAtt(\vm{Q}, \vm{K}, \vm{V}) = Concat ( Att_1, \dots, Att_h) \mW^O $ \\
with \\
$ Att_i = Attention(\vm{Q}\mW_i^Q, \vm{K}\mW_i^K, \vm{V}\mW_i^V)$
}
\end{column}
\begin{column}{.40\textwidth}
%\begin{block}{}
\centering{
\includegraphics[height=0.65\textheight]{multi_head_attention}
}
%\end{block}
\end{column}
\end{columns}\end{frame}
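%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Multi-head attention: sketch}
A minimal NumPy sketch of the formula above; the per-head \texttt{attention} is the scaled dot product detailed on the next slide, and all names and dimensions are illustrative.
{\scriptsize
\begin{verbatim}
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def attention(Q, K, V):                       # scaled dot product (next slide)
    return softmax(Q @ K.T / np.sqrt(K.shape[-1])) @ V

def multi_head(Q, K, V, WQ, WK, WV, WO):
    # Concat(Att_1, ..., Att_h) W^O, Att_i = Attention(Q W_i^Q, K W_i^K, V W_i^V)
    heads = [attention(Q @ WQ[i], K @ WK[i], V @ WV[i]) for i in range(len(WQ))]
    return np.concatenate(heads, axis=-1) @ WO

h, d, d_k = 8, 512, 64                        # 8 heads of dimension 64
WQ, WK, WV = (np.random.randn(h, d, d_k) for _ in range(3))
WO = np.random.randn(h * d_k, d)
Q = K = V = np.random.randn(10, d)            # self-attention over 10 positions
out = multi_head(Q, K, V, WQ, WK, WV, WO)     # (10, 512)
\end{verbatim}
}
\end{frame}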
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Scaled dot product attention}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{block}{}
\begin{itemize}
\item Attention = \textit{mapping} from
\begin{itemize}
\item a query (\texttt{Q})
\item a set of key-value pairs (\texttt{K}, \texttt{V})
\item[\ra] to an output \texttt{O}
\end{itemize}
\item \texttt{O} = $\sum_i \alpha_i V_i$ with $\alpha_i = f(\texttt{Q}, \texttt{K})$ (sketch on the next slide)
\item[\Ra]
$Attention(\texttt{Q}, \texttt{K}, \texttt{V}) = Softmax \left( \frac{\texttt{Q}\texttt{K}^T}{\sqrt{d_k}} \right) \texttt{V} $
\end{itemize}
\end{block}
\end{column}
\begin{column}{.45\textwidth}
%\begin{block}{}
\centering{
\includegraphics[height=0.75\textheight]{scaled_dot_product_attention}
}
%\end{block}
\end{column}
\end{columns}\end{frame}
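%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Scaled dot product attention: sketch}
A minimal NumPy sketch of $Attention(\texttt{Q}, \texttt{K}, \texttt{V})$ above; shapes and names are illustrative.
{\scriptsize
\begin{verbatim}
import numpy as np

def attention(Q, K, V):
    # O = Softmax(Q K^T / sqrt(d_k)) V
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)             # one score per (query, key) pair
    e = np.exp(scores - scores.max(axis=-1, keepdims=True))
    alpha = e / e.sum(axis=-1, keepdims=True)   # each row sums to 1
    return alpha @ V                            # weighted sum of the values

Q = np.random.randn(5, 64)                      # 5 queries of dimension d_k = 64
K = np.random.randn(7, 64)                      # 7 key-value pairs
V = np.random.randn(7, 64)
O = attention(Q, K, V)                          # (5, 64)
\end{verbatim}
}
\end{frame}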
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer: attention}
\begin{block}{3 different uses of attention}
\begin{itemize}
\item Traditional (encoder--decoder) attention:
\begin{itemize}
\item \texttt{Q}: previous decoder layer; \texttt{K} and \texttt{V}: encoder output
\item[\ra] the decoder attends to the input words
\end{itemize}
\item \texttt{Self-attention} in the encoder:
\begin{itemize}
\item \texttt{Q}, \texttt{K} and \texttt{V}: previous encoder layer
\end{itemize}
\item \texttt{Self-attention} in the decoder:
\begin{itemize}
\item \texttt{Q}, \texttt{K} and \texttt{V}: previous decoder layer
\item[!] only the left context is used (auto-regressive model); see the masking sketch on the next slide
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
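%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Decoder self-attention: masking sketch}
A minimal NumPy sketch of how the decoder self-attention is restricted to the left context: a mask of $-\infty$ is added to the scores before the softmax. Names are illustrative and the \texttt{Q}, \texttt{K}, \texttt{V} projections are omitted.
{\scriptsize
\begin{verbatim}
import numpy as np

def causal_mask(n):
    # position t may only attend to positions <= t
    return np.triu(np.full((n, n), -np.inf), k=1)

def masked_self_attention(X):
    scores = X @ X.T / np.sqrt(X.shape[-1])
    scores = scores + causal_mask(len(X))       # -inf => weight 0 after softmax
    e = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return (e / e.sum(axis=-1, keepdims=True)) @ X

X = np.random.randn(6, 64)                      # 6 target positions
Y = masked_self_attention(X)                    # row t ignores positions > t
\end{verbatim}
}
\end{frame}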
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer: animation}
\vspace{2cm}
\begin{block}{}
\centering{
\href{https://3.bp.blogspot.com/-aZ3zvPiCoXM/WaiKQO7KRnI/AAAAAAAAB_8/7a1CYjp40nUg4lKpW7covGZJQAySxlg8QCLcBGAs/s1600/transform20fps.gif}{\huge{Animation}}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer: attention visualisation}
\begin{block}{The attention changes depending on the sentence}
\centering{
\includegraphics[width=0.70\textwidth]{transformer_inner_attention_src}
\vspace{.5cm}
\includegraphics[width=0.70\textwidth]{transformer_inner_attention}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Transformer: attention visualisation}
\begin{itemize}
\item Combining the attention heads captures linguistic phenomena
\end{itemize}
\begin{block}{Anaphora}
\centering{
\includegraphics[width=0.70\textwidth]{att_vis_1}
}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Impact on the state of the art}
\begin{itemize}
\item WMT evaluation campaigns
\end{itemize}
\begin{block}{}
\begin{itemize}
\item 2017: \url{http://matrix.statmt.org/matrix/systems_list/1869} %2017
\item 2018: \url{http://matrix.statmt.org/matrix/systems_list/1881} %2018
\end{itemize}
\end{block}
\end{frame}