Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Loïc Barrault
supports_cours
Commits
2c5b2051
Commit
2c5b2051
authored
Dec 14, 2018
by
Loïc Barrault
Browse files
transformer model
parent
8533d34f
Changes
11
Hide whitespace changes
Inline
Side-by-side
trad_neuronale/figures/.gitignore
0 → 100644
View file @
2c5b2051
m2_trad_neuronale.pdf
m2_trad_neuronale_en.pdf
trad_neuronale/figures/att_vis_1.pdf
0 → 100644
View file @
2c5b2051
File added
trad_neuronale/figures/multi_head_attention.pdf
0 → 100644
View file @
2c5b2051
File added
trad_neuronale/figures/positionnal_emb.png
0 → 100644
View file @
2c5b2051
8.79 KB
trad_neuronale/figures/scaled_dot_product_attention.pdf
0 → 100644
View file @
2c5b2051
File added
trad_neuronale/figures/transformer_20fps.gif
0 → 100644
View file @
2c5b2051
1.18 MB
trad_neuronale/figures/transformer_architecture.pdf
0 → 100644
View file @
2c5b2051
File added
trad_neuronale/figures/transformer_inner_attention.png
0 → 100644
View file @
2c5b2051
41 KB
trad_neuronale/figures/transformer_inner_attention_src.png
0 → 100644
View file @
2c5b2051
12.1 KB
trad_neuronale/figures/transformer_paper_Vaswani.pdf
0 → 100644
View file @
2c5b2051
File added
trad_neuronale/transformer.tex
0 → 100644
View file @
2c5b2051
%!TEX root = m2_trad_neuronale.tex
% Section opener for the Transformer chapter of the course deck.
\section{Transformers}
\sectionpage
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Modèle non récurrent pour la traduction}
  \begin{block}{}
    \begin{itemize}
      \item Maîtrise du gradient difficile dans les RNN
      \item[\ra] GRU et LSTM améliorent la chose
      \item[\ra] Mécanisme d'attention permet de réduire le chemin des gradients
      \item[]
      % Non-breaking space so the citation never starts a line on its own.
      \item[\ra] Nouveau modèle proposé par~\cite{VaswaniSPUJGKP17}
      \item Plus de récurrence,
      % Fix: "paralélisable" -> "parallélisable" (spelling).
      \item Modèle largement parallélisable
      \item Nouvel état de l'art depuis quelques mois !
    \end{itemize}
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer}
  % Fix: straight double quotes render as two closing quotes in LaTeX;
  % use ``...'' for a proper opening/closing pair.
  \begin{block}{``Attention is all you need'',~\cite{VaswaniSPUJGKP17}}
    \centering{\includegraphics[height=0.75\textheight]{transformer_architecture}}
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer : composants clés}
  \begin{columns}
    \begin{column}{.60\textwidth}
      \begin{block}{}
        \begin{itemize}
          \item Modèle encodeur/décodeur
          \item[\ra] profond \Ra\ $N$ couches
          % Fix: plural agreement ("Embeddings positionnels").
          \item Embeddings positionnels
          \item[\ra] Crée un embedding encodant la place du mot dans la phrase
          \item Attention multiple (\texttt{multi-head})
          \item Projection \texttt{feed-forward}
          % Fix: stray space in the optional label ([\ra ] -> [\ra]).
          \item[\ra] réduction de dimension
          \item Connexions résiduelles
          \item[\ra] Facilite la propagation des gradients (cf. cours sur les CNN)
        \end{itemize}
      \end{block}
    \end{column}
    \begin{column}{.40\textwidth}
      %\begin{block}{"Attention is all you need", \cite{VaswaniSPUJGKP17}}
      \centering{\includegraphics[height=0.60\textheight]{transformer_architecture}}
      %\end{block}
    \end{column}
  \end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  % Fix: plural agreement ("Embeddings positionnels").
  \frametitle{Embeddings positionnels}
  \begin{block}{}
    \begin{itemize}
      \item Modèle non récurrent, non convolutionnel
      \item[\ra] perte de l'information positionnelle
      \item[\ra] absolue ou relative
      \item[]
      \centering{%
        % Fix: \sin / \cos are operators (upright), not products of the
        % variables s*i*n; \mathit{emb} keeps the multi-letter identifier
        % from being typeset with inter-variable spacing.
        $PE_{(pos, 2i)} = \sin(pos / 10000^{2i/\mathit{emb}})$ \quad
        $PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i/\mathit{emb}})$%
      }
    \end{itemize}
    \centering{\includegraphics[height=0.30\textheight]{positionnal_emb}}
    \begin{itemize}
      % French typography: space before the colon.
      \item Autre solution : apprendre un embedding \ra\ pas de différence
    \end{itemize}
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Multi-head attention}
  \begin{columns}
    \begin{column}{.55\textwidth}
      \begin{block}{}
        \begin{itemize}
          \item Attentions multiples
          \item[\ra] focus sur plusieurs sous-espaces de représentations
          \item[\ra] différentes positions
          \item En comparaison : 1 seule attention moyenne cela
        \end{itemize}
      \end{block}
      {\small
        % Fix: multi-letter operator names (MultiAtt, Concat, Att, Attention)
        % set in \mathrm{}; in plain math mode they typeset as products of
        % italic single-letter variables with wrong spacing.
        $\mathrm{MultiAtt}(\vm{Q}, \vm{K}, \vm{V}) =
          \mathrm{Concat}(\mathrm{Att}_1, \dots, \mathrm{Att}_h) \mW^O$ \\
        avec \\
        $\mathrm{Att}_i =
          \mathrm{Attention}(\vm{Q}\mW_i^Q, \vm{K}\mW_i^K, \vm{V}\mW_i^V)$
      }
    \end{column}
    \begin{column}{.40\textwidth}
      %\begin{block}{}
      \centering{\includegraphics[height=0.65\textheight]{multi_head_attention}}
      %\end{block}
    \end{column}
  \end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Scaled dot product attention}
  \begin{columns}
    \begin{column}{.5\textwidth}
      \begin{block}{}
        \begin{itemize}
          \item Attention = \textit{mapping}
            \begin{itemize}
              \item requête (\texttt{query Q})
              \item ensemble de paires clé-valeur (\texttt{K, V})
              \item[\ra] sortie \texttt{O}
            \end{itemize}
          % Fix: the output is the letter O (defined just above), not the
          % digit 0.
          \item \texttt{O} = $\sum_i \alpha_i V_i$
                avec $\alpha_i = f(\texttt{Q}, \texttt{K})$
          % Operator names upright (\mathrm), matching the Vaswani et al. paper.
          \item[\Ra] $\mathrm{Attention}(\texttt{Q}, \texttt{K}, \texttt{V}) =
            \mathrm{Softmax}\left(\frac{\texttt{Q}\texttt{K}^T}{\sqrt{d_k}}\right)
            \texttt{V}$
        \end{itemize}
      \end{block}
    \end{column}
    \begin{column}{.45\textwidth}
      %\begin{block}{}
      \centering{\includegraphics[height=0.75\textheight]{scaled_dot_product_attention}}
      %\end{block}
    \end{column}
  \end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer : attention}
  \begin{block}{3 différentes utilisations de l'attention}
    \begin{itemize}
      \item Traditionnelle :
        \begin{itemize}
          \item \texttt{Q} : couche décodeur précédente,
                \texttt{K} et \texttt{V} : sortie de l'encodeur
          \item[\ra] Décodeur porte son attention sur les entrées
        \end{itemize}
      % Fix: this case was opened as a second *nested* itemize inside the
      % first item (\end{itemize}\begin{itemize}); it must be a top-level
      % item, parallel to the decoder self-attention case below.
      \item Auto-attention : \texttt{self-attention} dans l'encodeur
        \begin{itemize}
          \item \texttt{Q}, \texttt{K} et \texttt{V} : couche encodeur précédente
        \end{itemize}
      \item Auto-attention : \texttt{self-attention} dans le décodeur
        \begin{itemize}
          \item \texttt{Q}, \texttt{K} et \texttt{V} : couche décodeur précédente
          \item[!] ne prend en compte que le contexte gauche (modèle auto-régressif)
        \end{itemize}
    \end{itemize}
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer : animation}
  \vspace{2cm}
  \begin{block}{}
    % External animated GIF (beamer cannot embed GIFs directly).
    \centering{%
      \href{https://3.bp.blogspot.com/-aZ3zvPiCoXM/WaiKQO7KRnI/AAAAAAAAB_8/7a1CYjp40nUg4lKpW7covGZJQAySxlg8QCLcBGAs/s1600/transform20fps.gif}{\huge{Animation}}%
    }
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer : visualisation de l'attention}
  \begin{block}{L'attention est modifiée en fonction de la phrase}
    % Two stacked attention heatmaps for the same source sentence.
    \centering{%
      \includegraphics[width=0.70\textwidth]{transformer_inner_attention_src}
      \vspace{.5cm}
      \includegraphics[width=0.70\textwidth]{transformer_inner_attention}%
    }
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Transformer : visualisation de l'attention}
  \begin{itemize}
    \item La combinaison des attentions modélise des phénomènes linguistiques
  \end{itemize}
  \begin{block}{Anaphores}
    \centering{\includegraphics[width=0.70\textwidth]{att_vis_1}}
  \end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{Impact sur l'état de l'art}
  \begin{itemize}
    \item Campagnes d'évaluation WMT
  \end{itemize}
  \begin{block}{}
    \begin{itemize}
      % Links to the WMT matrix result pages for each campaign.
      \item 2017: \url{http://matrix.statmt.org/matrix/systems_list/1869} %2017
      \item 2018: \url{http://matrix.statmt.org/matrix/systems_list/1881} %2018
    \end{itemize}
  \end{block}
\end{frame}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment