supports_cours: Commit f1439bb4
authored Dec 16, 2019 by Loïc Barrault
textproc nn
parent 38809fb0

Changes: 3 files
text_processing/refs.bib
...
...
@@ -8,3 +8,10 @@
  year = {2016}
}

@misc{mikolov2013,
  title  = {Efficient Estimation of Word Representations in Vector Space},
  author = {Tomas Mikolov and Kai Chen and Greg S. Corrado and Jeffrey Dean},
  year   = {2013},
  URL    = {http://arxiv.org/abs/1301.3781}
}
text_processing/text_processing.tex
...
...
@@ -92,9 +92,9 @@
%\input{ie_plan.tex}
%\input{ie_introduction.tex}        % 4th lecture + start 5th lecture
%\input{ie_ner.tex}                 % end of 5th lecture + 6th lecture
%\input{ie_relation_extraction.tex} % 7th lecture + 8th lecture (shorter)
\input{textproc_nn.tex}             % 9th lecture + eventually 10th
\input{sa_extra_reading}
...
...
text_processing/textproc_nn.tex
...
...
@@ -82,7 +82,7 @@ Hebb: \myemph{``Neurons that fire together, wire together''}
\vspace{.5cm}
Training method: change the weights $\vw$ if a training example $\vx$ is misclassified as follows:
\begin{itemize}
  \item[] $\vw^{new} = \vw^{cur} + \vx . y$ ~~~ with ~~~ $y \in \{+1,-1\}$
\end{itemize}
\end{frame}
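%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Small numeric illustration of the update rule above; the 2-dimensional
% example and the numbers are arbitrary, chosen only to make the step concrete.
\begin{frame}
  \frametitle{Perceptron: the update rule on a tiny example}
  \begin{itemize}
    \item Take $\vw^{cur} = (1, -2)$ and a training example $\vx = (1, 2)$ with label $y = +1$
    \item Current score: $\vw^{cur} \cdot \vx = 1 \cdot 1 + (-2) \cdot 2 = -3 < 0$ \Ra\ misclassified
    \item Update: $\vw^{new} = \vw^{cur} + \vx . y = (1+1, -2+2) = (2, 0)$
    \item New score: $\vw^{new} \cdot \vx = 2 > 0$ \Ra\ the example is now on the correct side
  \end{itemize}
\end{frame}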
...
...
@@ -136,39 +136,407 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\
\frametitle{How to train a multilayer perceptron?}
\begin{block}{\center \myemphb{Backpropagation}: Backward propagation of errors}
%\begin{center}
\begin{columns}
  \begin{column}{.5\textwidth}
    \[ \wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij} \]
  \end{column}
  \begin{column}{.5\textwidth}
    \begin{itemize}
      \item $E$: \textbf{loss function}
      \item $\lambda$: \textbf{learning rate}
      \item $\wij$: weight between neuron $i$ and $j$
    \end{itemize}
  \end{column}
\end{columns}
%\end{center}
\end{block}
\begin{itemize}
  \item Error function depending on the task
  \item Classification task \Ra\ estimate a probability distribution
  \[
    \begin{array}[t]{rcl@{\hspace{1cm}}rcl}
      y_i & = & \ds\frac{e^{a_i}}{\sum_k e^{a_k}} & \ds {\partial y_i}/{\partial a_k} & = & \delta_{ik} y_i - y_i y_k \\[10pt]
      \ds E(\vy,\vc) & = & \ds -\sum_i c_i \log y_i & \ds {\partial E}/{\partial y_i} & = & \ds -\frac{c_i}{y_i}
    \end{array}
  \]
\end{itemize}
\end{frame}
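%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Possible intermediate step (a standard derivation, added here as a sketch):
% combining the two partial derivatives above gives a very simple gradient
% with respect to the pre-activations $a_k$.
\begin{frame}
  \frametitle{How to train a multilayer perceptron? (softmax + cross-entropy)}
  \[
    \frac{\partial E}{\partial a_k}
      = \sum_i \frac{\partial E}{\partial y_i} \, \frac{\partial y_i}{\partial a_k}
      = \sum_i \left(-\frac{c_i}{y_i}\right) \left(\delta_{ik} y_i - y_i y_k\right)
      = -c_k + y_k \sum_i c_i
      = y_k - c_k
  \]
  \begin{itemize}
    \item The last step assumes the targets form a distribution, i.e. $\sum_i c_i = 1$ (e.g. a 1-hot target)
  \end{itemize}
\end{frame}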
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to train a multilayer perceptron?}
\begin{columns}[c]
  \begin{column}{.5\textwidth}
    \begin{block}{\center \myemphb{Chain rule}}
      \begin{center}
        $\ds \frac{\partial \mathbf{E}}{\partial \mathbf{W}} =
          \frac{\color{liumgreen}\partial \mathbf{E}}{\color{edinorange}\partial \mathbf{h^{2}}}
          \frac{\color{edinorange}\partial \mathbf{h^{2}}}{\color{cyan}\partial \mathbf{h^{1}}}
          \frac{\color{cyan}\partial \mathbf{h^{1}}}{\partial \mathbf{W}}$
      \end{center}
    \end{block}
  \end{column}
  \begin{column}{.5\textwidth}
    \begin{center}
      \includegraphics[width=4cm]{mlp_bp_grad}
    \end{center}
  \end{column}
\end{columns}

\textbf{Output layer}
\[
  \ds \frac{\partial E}{\partial \wij}
    = \ds \underbrace{\frac{\partial E}{\partial a_i}}_{\delta_i} \, \frac{\partial a_i}{\partial \wij}
    = \delta_i \, h_j
  \text{~~with~~}
  \delta_i = \ds \frac{\partial E}{\partial y_i} \, \frac{\partial y_i}{\partial a_i}
           = \ds \frac{\partial E}{\partial y_i} \, f^{~'}(a_i)
\]

\textbf{Hidden layer}
\[
  \ds \frac{\partial E}{\partial v_{jk}}
    = \ds \underbrace{\frac{\partial E}{\partial z_j}}_{\gamma_j} \, \frac{\partial z_j}{\partial v_{jk}}
    = \gamma_j \, x_k
  \text{~~with~~}
  \gamma_j = \ds \sum_i \frac{\partial E}{\partial a_i} \, \frac{\partial a_i}{\partial h_j} \, \frac{\partial h_j}{\partial z_j}
           = \ds \sum_i \delta_i \, \wij \, f^{~'}(z_j)
           = f^{~'}(z_j) \ds \sum_i \delta_i \wij
\]
\end{frame}
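%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Illustrative follow-up (standard result, added as a sketch): with a softmax
% output layer and the cross-entropy loss E = -sum_i c_i log y_i, the
% output-layer delta and gradient take a particularly simple form. Here
% \mathbf{W} is taken to be the output-layer weight matrix and \mathbf{h}
% the hidden activations feeding it (an assumption about the figure's notation).
\begin{frame}
  \frametitle{How to train a multilayer perceptron? (putting it together)}
  With a softmax output layer and the cross-entropy loss, $\delta_i = y_i - c_i$, so
  \[
    \frac{\partial E}{\partial \wij} = (y_i - c_i) \, h_j
    \qquad \text{and, in matrix form,} \qquad
    \frac{\partial E}{\partial \mathbf{W}} = (\vy - \vc) \, \mathbf{h}^{\top}
  \]
\end{frame}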
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Multilayer perceptron: training}
\begin{itemize}
  \item[1.] Normalise data
  \item[2.] Initialise the weights $\mW$
  \item[3.] \alert{Repeat}
  \begin{itemize}
    \item Pick a \textbf{batch} of examples $(\vx, \vc)$
    \item \textbf{Forward} pass: propagate the batch $\vx$ through the network \ra\ $\vy$
    \item Calculate the error $E(\vy, \vc)$
    \item \textbf{Backward} pass: \myemphb{backpropagation} \ra\ $\nabla \wij$
    \item Update weights $\wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij}$
    \item Optionally adjust the training meta-parameters (e.g. the learning rate $\lambda$)
  \end{itemize}
  \item[ ] \alert{until convergence}
\end{itemize}
\end{frame}
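%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% One update step with concrete numbers (arbitrary values, for intuition only).
\begin{frame}
  \frametitle{Multilayer perceptron: one update step, numerically}
  Assume $\wij^{cur} = 0.20$, $\lambda = 0.1$ and $\frac{\partial E}{\partial \wij} = 0.5$:
  \[
    \wij^{new} = \wij^{cur} - \lambda \frac{\partial E}{\partial \wij} = 0.20 - 0.1 \times 0.5 = 0.15
  \]
  The weight moves in the direction that decreases the loss $E$.
\end{frame}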
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\vfill
\centering
\Huge{\liumcyan{That's great, but where is the text?!?}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\begin{block}{\center \myemphb{Word Embedding}}
  \begin{center}
    Vector representation of a word \Ra\ vector of real values \\
  \end{center}
\end{block}
Also called continuous space representation.
\begin{itemize}
  \item<2-> What would be the simplest way of obtaining vectors?
    \only<3->{\Ra\ the so-called \myemphb{1-hot vector}:}
  \item[]<3->
  \begin{itemize}
    \item vector of size equal to the \textbf{vocabulary size}
    \item contains 0 everywhere except for a single 1 at a specific position
  \end{itemize}
  \vspace{1cm}
  \item<4-> Is that a good representation? \only<5->{\Ra\ \textbf{NO!}}
  \item[]<5->
  \begin{itemize}
    \item the distance between any two words is the same for all word pairs
    \item the position of the ``1'' is arbitrary
    \item \ra\ it is just a \textbf{coding}
  \end{itemize}
\end{itemize}
\only<3->{
  \begin{textblock*}{50mm}[0,0](105mm,40mm)
    \includegraphics[width=4cm]{one-hot}
  \end{textblock*}
}
\end{frame}
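%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Tiny illustration of the 1-hot coding above (toy vocabulary chosen for the example).
\begin{frame}
  \frametitle{How to represent words? (1-hot example)}
  With the toy vocabulary $V = \{\text{cat}, \text{dog}, \text{mat}, \text{sat}, \text{the}\}$, so $|V| = 5$:
  \[
    \text{cat} \mapsto (1,0,0,0,0) \qquad \text{mat} \mapsto (0,0,1,0,0) \qquad \text{the} \mapsto (0,0,0,0,1)
  \]
  Every pair of distinct words is at the same Euclidean distance $\sqrt{2}$: the coding carries no notion of similarity.
\end{frame}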
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\myemph{The semantic properties of the words are encoded in the dimensions of the vector}
\begin{minipage}[t][.7\textheight]{\textwidth}
  \centering
  \includegraphics[width=.7\textwidth]{king-white-embedding}<1->
  \includegraphics[width=.7\textwidth]{king-colored-embedding}<2->
  \includegraphics[width=.4\textwidth]{queen-woman-girl-embeddings}<3->
\end{minipage}
\vfill
\source{\textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!}}
\smallskip
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent words?}
\myemph{The semantic properties of the words are encoded in the dimensions of the vector}
\begin{center}
  \includegraphics[width=.5\textwidth]{king-analogy-viz}
\end{center}
Can be learned in several ways:
\begin{itemize}
  \item Extract handcrafted meaningful features
  \item \myemph{Use a neural network!}<2->
\end{itemize}
\vfill
\source{\textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!}}
\end{frame}
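%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% The analogy illustrated by the figure above, written out as vector arithmetic
% (the classic word2vec example; shown here as an approximate relation).
\begin{frame}
  \frametitle{How to represent words? (analogies as vector arithmetic)}
  \[
    \mathbf{v}(\text{king}) - \mathbf{v}(\text{man}) + \mathbf{v}(\text{woman}) \approx \mathbf{v}(\text{queen})
  \]
  In practice the right-hand side is recovered as the vocabulary word whose embedding is closest (e.g. by cosine similarity) to the left-hand side.
\end{frame}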
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Word embeddings: word2vec}
Language modelling task: given a prefix (sequence of words), predict the next word
\begin{columns}[c]
  \begin{column}{.5\textwidth}
    \begin{center}
      \textbf{CBOW} \\
      \includegraphics[width=4cm]{cbow}
    \end{center}
  \end{column}
  \begin{column}{.5\textwidth}
    \begin{center}
      \textbf{SkipGram} \\
      \includegraphics[width=4cm]{skipgram}
    \end{center}
  \end{column}
\end{columns}
\source{\textbf{\url{http://jalammar.github.io/illustrated-word2vec/}} \la\ \myemphb{Must read!}}
\source{Mikolov et al. \textbf{Efficient Estimation of Word Representations in Vector Space} \cite{mikolov2013}}
\end{frame}
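%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Illustration of the two training set-ups above (toy sentence and window size 1,
% chosen for the example): CBOW predicts the centre word from its context,
% SkipGram predicts the context words from the centre word.
\begin{frame}
  \frametitle{Word embeddings: word2vec (training pairs)}
  Sentence: ``the cat sat on the mat'', centre word ``sat'', window of size 1:
  \begin{itemize}
    \item \textbf{CBOW}: input $\{\text{cat}, \text{on}\}$ \ra\ predict ``sat''
    \item \textbf{SkipGram}: input ``sat'' \ra\ predict ``cat'' and predict ``on''
  \end{itemize}
\end{frame}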
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why does it work?}
\begin{center}
  \includegraphics[width=0.8\textwidth]{word_embeddings}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Why does it work?}
\begin{itemize}
  \item Let's assume that the word representations are \myemph{organised semantically}
  \item words $w_1$ and $w_2$ having similar meaning would be \textbf{close to each other} in this space
  \item[] \ra\ Consequently $\mathcal{F}(w_1) \approx \mathcal{F}(w_2)$
\end{itemize}
\begin{columns}[c]
  \begin{column}{.5\textwidth}
    \begin{itemize}
      \item[] Language modelling:
    \end{itemize}
    \begin{enumerate}
      \item I have got \edinred{10} \blue{euros} in my wallet
      \item This item costs \liumgreen{11} \blue{euros}
      \item In the U.S. it is \liumgreen{11} \edinorange{dollars}!
    \end{enumerate}
  \end{column}
  \begin{column}{.5\textwidth}
    \begin{center}
      \includegraphics<1>[width=0.8\textwidth]{fflm_generalisation}
      \includegraphics<2>[width=0.8\textwidth]{fflm_generalisation2}
    \end{center}
  \end{column}
\end{columns}
\Ra\ What is the probability that \edinred{10} is followed by \edinorange{dollars}?
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to represent sentences?}
Sentence = sequence of words \Ra\ we need an \myemphb{encoder}

Several possibilities have been developed:
\begin{itemize}
  \item<2-> \myemph{Recurrent neural network} (RNN)
  \begin{itemize}
    \item and its \textbf{bidirectional} version
    \item representation = single vector or matrix
    \item[]
  \end{itemize}
  \item<4-> \myemph{Convolutional neural network} (CNN)
  \begin{itemize}
    \item produces a single vector representation
    \item[]
  \end{itemize}
  \item<5-> Very recently, \myemph{Transformers} = self-attention
  \begin{itemize}
    \item representation = matrix (1 vector per word)
    \item Must read: \textbf{\url{http://jalammar.github.io/illustrated-transformer/}}
  \end{itemize}
\end{itemize}
\begin{textblock*}{30mm}[0,0](110mm,0mm)
  \includegraphics<2>[height=4cm]{figures/rnn_proj}
  \includegraphics<3->[height=4cm]{figures/rnn_proj2}
\end{textblock*}
\begin{textblock*}{30mm}[0,0](110mm,35mm)
  \includegraphics<4->[height=0.25\textheight]{figures/conv_sent_encoder}
\end{textblock*}
\begin{textblock*}{30mm}[0,0](110mm,35mm)
  \includegraphics<5->[height=0.25\textheight]{figures/conv_sent_encoder}
\end{textblock*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{How to classify sentences?}
\begin{itemize}
  \item The classifier is a neural network implementing a complex function $\mathcal{F}$
  \begin{itemize}
    \item that operates in the \textbf{continuous space}
    \item that maps input vectors to a \textbf{probability distribution} over the desired classes
  \end{itemize}
\end{itemize}
\begin{enumerate}
  \item Encode the sentence
  \begin{itemize}
    \item get a vector
    \item get a matrix (1 vector per word) \ra\ compress into 1 vector
    \begin{itemize}
      \item \textbf{pooling} operation (usually mean or max)
      \item concatenation
    \end{itemize}
  \end{itemize}
  \item Non-linear classification layer(s) \ra\ get a vector of scores $\vz$ (1 for each class)
  \item Get a probability distribution by normalization \ra\ softmax
  \begin{itemize}
    \item[]
    \begin{center}
      $p(\vc = j | \theta) = \ds\frac{e^{\vz_j}}{\ds\sum_{k=1}^{\|V\|} e^{\vz_k}}$
    \end{center}
  \end{itemize}
\end{enumerate}
\end{frame}
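%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Numeric illustration of the softmax step above (3 classes, arbitrary scores).
\begin{frame}
  \frametitle{How to classify sentences? (softmax, numerically)}
  With scores $\vz = (2, 1, 0)$:
  \[
    (e^{2},\ e^{1},\ e^{0}) \approx (7.39,\ 2.72,\ 1.00), \qquad
    \sum_k e^{\vz_k} \approx 11.11, \qquad
    p \approx (0.665,\ 0.245,\ 0.090)
  \]
  The scores are mapped to positive values that sum to 1, i.e. a probability distribution over the classes.
\end{frame}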
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Encoding a sentence with a (bi-)RNN}
Sentence: "\textbf{A long time ago in a galaxy far , far away}"
\begin{center}
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_1}<+>% if you remove the '%' then the
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_2}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_3}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_7}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_10}<+>%
% \includegraphics[height=0.8\textheight]{figures/rnn_seq_all}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_1}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_2}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_7}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_fall}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r1}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r2}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_r3}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_rall}<+>%
\includegraphics[height=0.8\textheight]{figures/bi_rnn_seq_all}<+>%
\end{center}
%centering
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Pooling operation}
Compute the feature-wise \myemph{average} or \myemph{maximum} \textbf{activation} of a set of vectors \\
Aim: sub-sampling \ra\ result is a vector!
\begin{center}
  \includegraphics[height=0.5\textheight]{figures/pooling}%
\end{center}
\source{A comment on max pooling to read: \url{https://mirror2image.wordpress.com/2014/11/11/geoffrey-hinton-on-max-pooling-reddit-ama/}}
\end{frame}
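%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Small numeric illustration of mean and max pooling (three 3-dimensional
% vectors with arbitrary values, pooled feature-wise, i.e. per dimension).
\begin{frame}
  \frametitle{Pooling operation (example)}
  Vectors: $(1, 4, 2)$, $(3, 0, 2)$, $(2, 2, 5)$
  \begin{itemize}
    \item mean pooling \ra\ $(2, 2, 3)$
    \item max pooling \ra\ $(3, 4, 5)$
  \end{itemize}
  In both cases a variable-size set of vectors is reduced to a single vector of the same dimension.
\end{frame}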
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Classification layer \Ra\ Softmax}
Get a probability distribution by normalization \ra\ softmax:
$p(\vc = j | \theta) = \ds\frac{e^{\vz_j}}{\ds\sum_{k=1}^{\|V\|} e^{\vz_k}}$
\begin{center}
  \includegraphics[height=0.6\textheight]{figures/classif_layer}%
\end{center}
\end{frame}
...
...
@@ -188,11 +556,9 @@ y_i^{c} & = & f \left(\sum_j w^{c-1}_{ij} ~ y_j^{c-1}\right) \\
\myemph{Project} or represent the \textbf{text} into a \myemph{continuous space} and train an estimator operating in this space to compute the probability of the sentiment.
\end{block}
Basically, it looks like this:
%\includegraphics[width=0.75\textwidth]{sa_nn}
\begin{center}
  \includegraphics[height=0.6\textheight]{sa_nn}
\end{center}
\end{frame}
...
...