Commit 7ca10ba3 authored by Loïc Barrault's avatar Loïc Barrault

various modifications

parent 645f1e98
@@ -291,7 +291,7 @@
\includegraphics[valign=t, width=0.19\textwidth]{figures/padding_strides_02}
\includegraphics[valign=t, width=0.19\textwidth]{figures/padding_strides_03}
}
\item Ex: kernel $k = 3 \times 3$, input $i = 5 \times 5$, padding $p = 1 \times 1$, stride $s = 2 \times 2$ (output size sketched below)
\end{itemize}
\end{block}
\end{frame}
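% Aside: a minimal Python sketch of the output-size arithmetic behind these
% examples; the helper name conv_out is mine, not from the slides.
\begin{verbatim}
# Output size of a 2-D convolution, per dimension:
# o = floor((i + 2p - k) / s) + 1
def conv_out(i, k, p, s):
    return (i + 2 * p - k) // s + 1

# k=3, p=1, s=2: inputs of size 5 and 6 both give an output of size 3,
# which is why the last row/column of a 6x6 input is never visited.
for i in (5, 6):
    print(i, "->", conv_out(i, k=3, p=1, s=2))
\end{verbatim}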
@@ -314,7 +314,7 @@
\includegraphics[valign=t, width=0.15\textwidth]{figures/padding_strides_odd_03}
}
\item \alert{Note}: the last row and the rightmost column of the input are not processed!
\item Despite the different input sizes, the outputs have the same size (fixed kernel)
\end{itemize}
\end{block}
\end{frame}
@@ -328,6 +328,9 @@
\item[]
\item What happens if we want to go the other way?
\item[\ra] Notion of transposed convolution (shape arithmetic sketched below)
\item[\ra] Deconvolution
\item[]
\item Visualization: \url{http://scs.ryerson.ca/~aharley/vis/conv/}
\end{itemize}
\end{block}
\end{frame}
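% Aside: the same arithmetic run backwards, as a hedged sketch of what a
% transposed convolution does to shapes (formula from the usual convolution
% arithmetic, not stated on the slide).
\begin{verbatim}
# A transposed convolution maps an output of size i back to the size a
# regular convolution would have consumed: o = s*(i - 1) + k - 2p.
def tconv_out(i, k, p, s):
    return s * (i - 1) + k - 2 * p

print(tconv_out(3, k=3, p=1, s=2))  # -> 5, recovering the 5x5 input size
\end{verbatim}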
@@ -421,7 +424,7 @@
\item Same operation as for the RNN, but for each pair of words
\item[]\centerline{$p = \tanh \left( \mW \colvec{c_1\\c_2 } + \vb \right) $}
\item convolution over the word vectors (sketched below)
\item[\ra] The weights $\mW$ and $\vb$ are shared (fewer parameters)
\end{itemize}
\end{block}
\end{frame}
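% Aside: a minimal NumPy sketch of the pairwise composition
% p = tanh(W [c1; c2] + b) with shared weights; all sizes are illustrative.
\begin{verbatim}
import numpy as np

d = 4                                 # word-vector dimension (assumed)
rng = np.random.default_rng(0)
W = rng.normal(size=(d, 2 * d))       # shared across every word pair
b = rng.normal(size=d)

def compose(c1, c2):
    # p = tanh(W [c1; c2] + b)
    return np.tanh(W @ np.concatenate([c1, c2]) + b)

c1, c2 = rng.normal(size=d), rng.normal(size=d)
p = compose(c1, c2)                   # parent representation in R^d
\end{verbatim}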
@@ -493,7 +496,7 @@
\item Possible windows of size $h$: $\{ \vx_{1:h}, \vx_{2:h+1}, \dots, \vx_{n-h+1:n}\}$
\item[] The result is a \textbf{feature map} (sketched below)
\begin{itemize}
\item $\vm{c} = [c_1, c_2, \dots, c_{n-h+1}] \in \mathbb{R}^{n-h+1}$
\end{itemize}
\only<1>{ \centerline{\includegraphics[valign=t, width=0.5\textwidth]{figures/cnn_feat_map}} }
\only<2>{ \centerline{\includegraphics[valign=t, width=0.51\textwidth]{figures/cnn_feat_map_2}} }
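% Aside: a hedged NumPy sketch of building one feature map from a single
% filter w of window size h; the sizes and the tanh choice are mine.
\begin{verbatim}
import numpy as np

d, h, n = 4, 2, 6                     # embedding dim, window, sentence length
rng = np.random.default_rng(0)
X = rng.normal(size=(n, d))           # the n word vectors x_1..x_n
w = rng.normal(size=h * d)            # one convolution filter
b = 0.1

# c_i = tanh(w . x_{i:i+h-1} + b) for every window, so c lives in R^(n-h+1)
c = np.array([np.tanh(w @ X[i:i + h].ravel() + b)
              for i in range(n - h + 1)])
print(c.shape)                        # -> (5,) = (n - h + 1,)
\end{verbatim}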
@@ -511,7 +514,7 @@
\item A new building block for neural networks: \textbf{pooling}
\item In particular: a \textbf{temporal} max pooling layer
\item Idea: capture the most important activation across time
\item From a feature map $\vm{c} = [c_1, c_2, \dots, c_{n-h+1}] \in \mathbb{R}^{n-h+1}$
\begin{itemize}
\item[\ra] Keep a single value: $\hat{c} = \max(\vm{c})$ (sketched below)
\end{itemize}
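% Aside: a tiny sketch of temporal max pooling; the two hand-written maps
% stand in for filters with different window sizes h, hence different lengths.
\begin{verbatim}
import numpy as np

c_trigram = np.array([0.2, -0.5, 0.9])            # map from a size-3 filter
c_bigram  = np.array([0.1, 0.4, -0.3, 0.8, 0.0])  # map from a size-2 filter
pooled = [c.max() for c in (c_trigram, c_bigram)]
print(pooled)  # -> [0.9, 0.8]: one value per map, whatever the map length
\end{verbatim}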
@@ -520,7 +523,7 @@
\item<+->[\ra] Solution 1: use several filters $\vw$
\only<1-5>{\begin{itemize}
\item<+-> For example, filters $\vw$ of different sizes ($h$)
\item<+-> Thanks to \textbf{max pooling}, the length of $\vm{c}$ does not matter.
\item<+-> We can use filters that look at unigrams, bigrams, trigrams, etc.
\end{itemize} }
\only<6-10>{\item<+->[\ra] Solution 2: (idea) use several channels
@@ -579,11 +582,11 @@
\item[\ra] probability $p$: a hyperparameter (determined empirically)
\item Reminder: Bernoulli variable:
\begin{itemize}
\item[] \centerline{ $ P(X = x) = \begin{cases} p \mathrm{~if~} x=1 \\ 1-p \mathrm{~if~} x=0 \\ 0 \mathrm{~otherwise~} \end{cases}$ }
\end{itemize}
\item Some features are ignored during training (sketched below):
\centerline{ $ y = softmax \left( \mW^{(s)}(\vm{r} \circ \vm{z}) + \vb \right)$ }
\item[\ra] prevents overfitting
\end{itemize}
\end{block}
\end{frame}
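% Aside: a minimal NumPy sketch of the dropout step
% y = softmax(W_s (r o z) + b), with r drawn element-wise from Bernoulli(p);
% sizes and the seed are illustrative.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
p, dim, n_classes = 0.5, 6, 3
z = rng.normal(size=dim)                   # pooled feature vector
W_s = rng.normal(size=(n_classes, dim))
b = np.zeros(n_classes)

r = rng.binomial(1, p, size=dim)           # Bernoulli(p) mask, zeros drop units
scores = W_s @ (r * z) + b
y = np.exp(scores) / np.exp(scores).sum()  # softmax over the classes
\end{verbatim}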
......
@@ -17,22 +17,8 @@
\setbeamertemplate{navigation symbols}{}
\usepackage[french]{babel}
\usepackage[utf8]{inputenc}
\usepackage{times}
\usepackage{epsfig}
\usepackage{comment}
\usepackage{url}
\usepackage{multirow}
\usepackage[T1]{fontenc}
%\usepackage{natbib}
%\usepackage{multimedia}
\usepackage{array}
\setlength{\extrarowheight}{3pt}
%\usepackage{xspace}
%\usepackage{amsmath}
%\usepackage{array}
%\setlength{\extrarowheight}{3pt}
\usepackage{graphicx}
\usepackage[export]{adjustbox}
@@ -53,7 +39,7 @@
\institute[LIUM, Le Mans Université]
{
loic.barrault@univ-lemans.fr \\
Le Mans Université \\
}
%\date{09 janvier 2017}
......
@@ -41,7 +41,6 @@
%\lfoot{17 Novembre 2017}
\rfoot{\thepage\ / \pageref{LastPage}}
\cfoot{}
\newcommand{\ds}{\displaystyle}
\vspace{\stretch{1}}
......
@@ -48,22 +48,8 @@
%\insertframenumber/\inserttotalframenumber\hfill}%
\usepackage[french]{babel}
\usepackage[utf8]{inputenc}
\usepackage{times}
\usepackage{epsfig}
\usepackage{comment}
\usepackage{url}
\usepackage{multirow}
\usepackage[T1]{fontenc}
%\usepackage{natbib}
%\usepackage{multimedia}
\usepackage{array}
\setlength{\extrarowheight}{3pt}
%\usepackage{xspace}
%\usepackage{amsmath}
\input ../macros.tex
\input ../macros_beamer.tex
......
\documentclass[t]{beamer}
%\documentclass[handout,t]{beamer}
% pdf2ps cm_parole.pdf;
% intro, codage + DTW: psselect -p 1-35,51-61-63 cm_parole.ps > cm_parol_poly.ps
% DTW alone: psselect -p 1-19,35-47 cm_parole.ps > cm_parole_poly.ps
% decode psselect -p1-47,51-74 cm_parole.ps > cm_parole_poly.ps
% psnup -4 -H96mm -W128mm -m15mm -b6mm cm_parole_poly.ps cm_parole_poly.ps4
%
%\usepackage{pgfpages}
%\pgfpagelayout{4 on 1}{a4paper,landscape}
\mode<presentation>
{
%\usetheme{PaloAlto}
% \usetheme{Hannover}
\usetheme{informatics}
\useoutertheme{infolines}
% \setbeamercovered{transparent} % or whatever (possibly just delete it)
}
\def\swidth{.7cm}
\setbeamersize{sidebar width left=\swidth}
\setbeamertemplate{sidebar left}
{
{\usebeamerfont{title in sidebar}%
\vskip1.5em%
\usebeamercolor[fg]{title in sidebar}%
\insertshorttitle[width=\swidth,center,respectlinebreaks]\par%
\vskip1.25em%
}%
{%
\usebeamercolor[fg]{author in sidebar}%
\usebeamerfont{author in sidebar}%
\insertshortauthor[width=\swidth,center,respectlinebreaks]\par%
\vskip1.25em%
}%
\hbox to2cm{\hss\insertlogo\hss}
\vskip1.25em%
\insertverticalnavigation{\swidth}%
\vfill
\hbox to2cm{\hskip0.6cm\usebeamerfont{subsection in
sidebar}\strut\usebeamercolor[fg]{subsection in
sidebar} }
\vskip3pt%
}%
%\insertframenumber/\inserttotalframenumber\hfill}%
\usepackage{array}
\setlength{\extrarowheight}{3pt}
\input ../macros_en.tex
\input ../macros_beamer.tex
\usepackage[absolute,showboxes,overlay]{textpos}
%\TPshowboxestrue % comment out once finished
\TPshowboxesfalse % uncomment to make the boxes disappear
\textblockorigin{10mm}{10mm} % origin of the positions
% This is only inserted into the PDF information catalog. Can be left out.
\subject{Neural Machine Translation}
\title[]{Neural Machine Translation}
\author[]{Lo\"ic Barrault}
\institute[LIUM, Le Mans Universit\'e]
{
loic.barrault@univ-lemans.fr \\
Laboratoire d'Informatique de l'Universit\'e du Maine \\
}
\date{}
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
%\pgfdeclareimage[height=0.5cm]{limsi-logo}{limsilogo}
%\logo{\pgfuseimage{limsi-logo}}
%\logo{\includegraphics[height=0.5cm]{limsilogo}}
%\logo{\epsfbox{limsilogo.eps}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
%\AtBeginSubsection[]
%{
% \begin{frame}<beamer>
% \frametitle{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
%}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
%\beamerdefaultoverlayspecification{<+->}
\newtheorem{conclusion}[theorem]{Conclusions}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% tutorial TALN
% CM : SMT
%\input{mt_tutl.tex}
%Loic
%\input{mt_intro2.tex}
%\input{mt_ressources.tex}
%\input{mt_eval.tex}
%\input{mt_lm.tex}
%\input{mt_pivot.tex}
%\input{mt_datasel.tex}
\input{mt_neural_en.tex}
%Holger
%\input{cslm.tex}
%\input{mt_align.tex}
%\input{mt_decode.tex}
%\input{mt_tools.tex}
%\input{mt_nlp2.tex}
%\input{mt_asr2.tex}
%\input{mt_concl2.tex}
% old stuff
%\input{mt_plan.tex}
%\input{mt_intro.tex}
%\input{mt_bleu.tex}
%\input{mt_tasks_light.tex}
%\input{mt_smt_light.tex}
%\input{mt_sys_light.tex}
%\input{mt_results.tex} % obsolete, results of 2007
%\input{mt_nlp.tex} % old
%\input{mt_asr.tex} % old
% and kill the abominable icon
\setbeamertemplate{bibliography item}{}
\begin{frame}[allowframebreaks]
\frametitle{References}
% \bibliographystyle{amsalpha}
\bibliographystyle{apalike}
\bibliography{refs}
\end{frame}
\end{document}
%!TEX root = m2_trad_neuronale_en.tex
%\section{Neural Machine Translation}
%\subsection{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Plan}
\begin{block}{}
\begin{itemize}
\item Motivations
\item Non-Markovian parametric models
\item Neural Machine Translation
\item Attention Mechanism
\item[]
\item[] Inspired by ``\textit{From Sequence Modeling to Translation}'', K. Cho, Univ. Montréal
\end{itemize}
\end{block}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Reminder: statistical approach}
\begin{block}{Formulation}
\begin{itemize}
\item[] \hspace{1cm} $ t^{*} = \argmax_t P(t|s) = \argmax_t P(s|t) P(t) $
\item[] \hspace{1cm} $ t^{*} = \argmax_t \sum_{i=1}^{N} \lambda_i\ FF_i(s,t) $
\end{itemize}
\end{block}
\begin{block}{Feature Functions}
\begin{itemize}
\item chosen by \textbf{experts}
\item[\ra] probability of translating $\hat{s}$ into $\hat{t}$ and $\hat{t}$ into $\hat{s}$, lexical probabilities, LM, etc. (decision rule sketched after this frame)
\item[\ra] \alert{Non-parametric estimator}
\item inputs are (sequences of) words
\end{itemize}
\end{block}
\end{frame}
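% Aside: a hedged Python sketch of the log-linear decision rule
% t* = argmax_t sum_i lambda_i FF_i(s, t); the feature functions below are
% made up purely for illustration.
\begin{verbatim}
def best_translation(source, candidates, feature_functions, weights):
    def score(t):
        return sum(l * ff(source, t)
                   for l, ff in zip(weights, feature_functions))
    return max(candidates, key=score)

cands = ["a translation", "another one"]
ffs = [lambda s, t: len(t),                 # toy "LM" feature
       lambda s, t: -abs(len(s) - len(t))]  # toy length-ratio feature
print(best_translation("une phrase", cands, ffs, weights=[0.3, 0.7]))
\end{verbatim}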
%\begin{frame}
% \frametitle{Problématiques}
%
%\begin{itemize}
% \item Traduction = transformation d'une séquence de mots d'une langue vers une autre
% \item Méthode à base de segment
% \item[\ra] Traduction par petits morceaux \Ra\ \textit{phrase pairs}
% \item[\ra] Probabilités associées aux \textit{phrases}
%\end{itemize}
%\end{frame}
\begin{frame}
\frametitle{Observations}
\begin{block}{Translation Table}
\begin{itemize}
\item The maximum \textit{phrase} length matters
\item[\ra] the number of PT entries explodes
\item[]
\item Probabilities are estimated by relative frequency
\item[\ra] problematic for infrequent sequences
\item especially long sequences
\item[]
\item[\ra] \alert{Link to the order-$n$ Markov hypothesis of the LMs (truncation sketched after this frame):}
$ p(w_i | w_0, \cdots, w_{i-1}) \approx p(w_i | w_{i-n}, \cdots, w_{i-1}) $
\end{itemize}
\end{block}
\end{frame}
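% Aside: a one-function sketch of the order-n Markov truncation: estimation
% only ever sees the n previous words.
\begin{verbatim}
def markov_context(history, n):
    return history[-n:]   # p(w_i | w_0..w_{i-1}) ~ p(w_i | last n words)

print(markov_context("the cat sat on the".split(), n=2))  # -> ['on', 'the']
\end{verbatim}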
%----
\begin{frame}
\frametitle{Issues}
\begin{block}{}
\begin{itemize}
\item Create a model able to represent sequences of various lengths
\item[\ra] \alert{Can we get rid of the Markov hypothesis?}
\item[\ra] i.e. model $ p(w_i | w_0, \cdots, w_{i-1}) $ entirely
\item with better probability estimation
\end{itemize}
\end{block}
\end{frame}
%----
\begin{frame}
\frametitle{Parametric model}
\begin{block}{Reminder: feed-forward neural language model}
\begin{itemize}
\item Still an n-gram model
\item $ p(w_i | w_{i-n}, \cdots, w_{i-1}) \approx f(w_{i-n}, \cdots, w_{i-1}) $
\item $f$: a function estimating the probability of word $w_i$ from the $n$ previous words \ra\ learned with a NN
\end{itemize}
\end{block}
\centerline{
\includegraphics[width=0.40\textwidth]{figures/fflm_all}
}
\end{frame}
%----
\begin{frame}
\frametitle{Parametric model}
%\vspace{-.5cm}
\begin{block}{Feedforward NN}
\begin{description}
\item[1.] Word encoding as a ``\alert{1-hot}'' vector
\item[\ra] $ w_j = \left[ 0, \cdots, 0, 1, 0, \cdots, 0 \right]^t $ (1 at index $j$)
\item[2.] Project $w_j$ into a continuous space
\item[\ra] $ p_j = \vm{W} ^t ~ w_j \in \mathbb{R}^d $ \\
\begin{itemize}
\item[] {\scriptsize with $\vm{W} \in \mathbb{R}^{\|V\| \times d}$ and $d \ll \|V\| $}
\item[] {\scriptsize $\|V\|$ : vocabulary size}
\end{itemize}
\item[3.] Concatenate \alert{context}
\item[\ra] $\vm{c} = [p_{j-n} ; \cdots ; p_{j-1}]^t$
\item[4.] Non-linear projection of the \alert{context}
\item[\ra] $\vm{d} = \phi ( \vm{U}^t ~ \vm{c} + \vm{b_U}) $ with $\vm{b_U}$ the bias
\item[\ra] $\phi$: non-linear activation function (steps 1--4 sketched after this frame)
\end{description}
\end{block}
\begin{textblock*}{30mm}[0,0](83mm,20mm)
\includegraphics[height=5cm]{figures/fflm_proj}
\end{textblock*}
\end{frame}
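% Aside: a hedged NumPy sketch of steps 1-4 (1-hot encoding, projection,
% context concatenation, non-linear projection); all sizes are toy values
% and phi is taken to be tanh.
\begin{verbatim}
import numpy as np

V, d, n, H = 10, 3, 2, 5          # vocab size, embedding dim, context, hidden
rng = np.random.default_rng(0)
W = rng.normal(size=(V, d))       # projection matrix, d << V in practice
U = rng.normal(size=(n * d, H))
b_U = np.zeros(H)

context = [4, 7]                  # indices of the n previous words
P = [np.eye(V)[j] @ W for j in context]  # 1-hot times W = row lookup
c = np.concatenate(P)             # step 3: concatenated context
d_vec = np.tanh(U.T @ c + b_U)    # step 4: d = phi(U^t c + b_U)
\end{verbatim}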
%----
\begin{frame}
\frametitle{Parametric model}
\begin{block}{Feedforward NN}
%\begin{varblock}[7cm]{RdN Feedforward}
\begin{description}
\item[5.] Compute the score
\item[\ra] $\vm{z} = \vm{S}^t ~ \vm{d} + \vm{b_S} $ ~~~ {\small with $\vm{b_S}$ the bias}
\item[]
\item[6.] Compute the probability using the \alert{softmax} (sketched after this frame)
\item[\ra] $p(w_i = j | \cdots) = \ds \frac{ e^{z_j}}{\sum_{k=1}^{\|V\|} e^{z_k}}$
\begin{itemize}
\item[]
\item[] with $z_j$ the $j$-th element of $\vm{z}$
\end{itemize}
\item[]
\end{description}
\end{block}
%\end{varblock}
\begin{textblock*}{30mm}[0,0](80mm,12mm)
\includegraphics[height=4.5cm]{figures/fflm_estim}
\end{textblock*}
\end{frame}
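% Aside: a hedged NumPy sketch of steps 5-6; d_vec stands in for the hidden
% vector computed in the previous sketch, and sizes are illustrative.
\begin{verbatim}
import numpy as np

H, V = 5, 10
rng = np.random.default_rng(0)
S = rng.normal(size=(H, V))
b_S = np.zeros(V)
d_vec = rng.normal(size=H)       # hidden vector from steps 1-4

z = S.T @ d_vec + b_S            # step 5: scores
z = z - z.max()                  # shift for numerical stability
p = np.exp(z) / np.exp(z).sum()  # step 6: p(w_i = j|...) = e^z_j / sum_k e^z_k
print(p[3])                      # probability of word index 3
\end{verbatim}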
%----
\begin{frame}
\frametitle{Parametric model}
%\vspace{-.5cm}
\begin{block}{}
\begin{itemize}
% \item Entraînement par back-propagation du gradient d'erreur
\item Maximum likelihood + backpropagation (objective sketched after this frame)
\item[]
\item[\ra] $\theta^* = \argmax_{\theta} \frac{1}{N} \ds \sum_{n=1}^N \sum_{i=1}^I \log p_{\theta} (w_i | w_{i-n}, \cdots, w_{i-1})$
\item[]
\item[] $N$: number of training examples
\item[] $\theta = \{ \vm{W}, \vm{U}, \vm{S}, \vm{b_U}, \vm{b_S} \}$
\item[]
\item $\vm{W}$ : embeddings
\item $\vm{d}$ : sentence representation?
\item[]
\item[]
\end{itemize}
\end{block}
\begin{textblock*}{30mm}[0,0](68mm,23mm)
\includegraphics[height=4.5cm]{figures/fflm_all}
\end{textblock*}
\end{frame}
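% Aside: a hedged sketch of the quantity being maximized: the mean log
% probability of each word given its n-word context; prob() is a placeholder
% for the softmax output of the network above.
\begin{verbatim}
import numpy as np

def log_likelihood(sentences, prob, n):
    total, count = 0.0, 0
    for words in sentences:
        for i in range(n, len(words)):
            total += np.log(prob(words[i], words[i - n:i]))
            count += 1
    return total / count   # maximized over theta by backpropagation
\end{verbatim}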
%----
\begin{frame}
\frametitle{Embeddings}
\begin{figure}
\centering
\includegraphics[height=6cm]{figures/Turian-WordTSNE_all}%
\onslide<2->{
\llap{\raisebox{1cm}{% move next graphics to top right corner
\centerline{
\includegraphics[width=0.6\textwidth]{figures/Turian-WordTSNE}
}
}}}
\end{figure}
\end{frame}
%----
\begin{frame}
\frametitle{Why does it work?}
\begin{block}{}
\begin{itemize}
\item Better probability estimation for unseen n-grams
\item[\ra] backoff LM: reduce context size + weighting
\item[\ra] neural LM: operates in the continuous space
\item[]
\item A neural LM represents the corpus with a \alert{continuous function}
\item[Ex.]
\begin{enumerate}
\item I have \edinred{10} \blue{euros} in my wallet
\item This object costs \liumgreen{11} \blue{euros}
\item In the U.S. it is \liumgreen{11} \edinorange{dollars}!
\end{enumerate}
\item[]
\item[] What is the probability of \edinred{10} being followed by \edinorange{dollars}?
\item[]
\end{itemize}
\end{block}
\end{frame}
%----
\begin{frame}
\begin{block}{}
\begin{itemize}
\item[] What is the probability of \edinred{10} being followed by \edinorange{dollars}?
\item[]
\item[]
\begin{figure}
\centering
\includegraphics<1>[width=0.6\textwidth]{figures/fflm_generalisation}
\includegraphics<2>[width=0.6\textwidth]{figures/fflm_generalisation2}
\end{figure}
\item[]
\item[\ra]<2> \edinred{10} and \liumgreen{11} are (often?) seen in the same \alert{context}
\item[\ra]<2> The parametric model allows for better generalization
\item[]
\end{itemize}
\end{block}
\end{frame}
%----
\begin{frame}
\frametitle{}
\begin{block}{}
\begin{itemize}
\item Can we get rid of Markov hypothesis?
\item[\ra] Non-Markovian model
\item[\ra] $ p(w_0, \cdots, w_{I}) = p(w_0) \prod_{i=1}^I p(w_i | \orange{w_0,\cdots,w_{i-1}})$ (factorization sketched after this frame)
\item[]
\item We saw that: neural model provides better probability estimates
\item[\ra] better generalization, less sensitive to data \alert{sparsity}
\item[\Ra] could we increase the context size?
\item[]
\item Issue: \textbf{sentence length is unbounded!}
\item<2> Solution: \alert{compress the history!}
\item[]
\end{itemize}
\end{block}
\end{frame}
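% Aside: a minimal sketch of the non-Markovian factorization: the full,
% untruncated history is passed to the model at every position; prob() is a
% placeholder for such a model (e.g. one backed by a compressed RNN state).
\begin{verbatim}
import math

def sentence_logprob(words, prob):
    # log p(w_0..w_I) = sum_i log p(w_i | w_0..w_{i-1}): chain rule, exact
    return sum(math.log(prob(words[i], words[:i]))
               for i in range(len(words)))
\end{verbatim}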
%-------------------------------------------------------
%RNN
\begin{frame}
\frametitle{Recurrent Neural Network}
\begin{block}{}
\begin{itemize}
\item Problem: \textbf{sentences have variable lengths!}
\item Solution: \alert{compress the history!}
\end{itemize}
\end{block}
\begin{block}{Protocol}